From e5ab012c3271990e8457055c25cafddc1ae8aa6b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Wed, 20 Feb 2013 16:15:36 +0100 Subject: nohz: Make tick_nohz_irq_exit() irq safe As it stands, irq_exit() may or may not be called with irqs disabled, depending on __ARCH_IRQ_EXIT_IRQS_DISABLED that the arch can define. It makes tick_nohz_irq_exit() unsafe. For example two interrupts can race in tick_nohz_stop_sched_tick(): the inner most one computes the expiring time on top of the timer list, then it's interrupted right before reprogramming the clock. The new interrupt enqueues a new timer list timer, it reprogram the clock to take it into account and it exits. The CPUs resumes the inner most interrupt and performs the clock reprogramming without considering the new timer list timer. This regression has been introduced by: 280f06774afedf849f0b34248ed6aff57d0f6908 ("nohz: Separate out irq exit and idle loop dyntick logic") Let's fix it right now with the appropriate protections. A saner long term solution will be to remove __ARCH_IRQ_EXIT_IRQS_DISABLED and mandate that irq_exit() is called with interrupts disabled. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Linus Torvalds <torvalds@linuxfoundation.org> Cc: <stable@vger.kernel.org> #v3.2+ Link: http://lkml.kernel.org/r/1361373336-11337-1-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- kernel/time/tick-sched.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 314b9ee07edf..520592ab6aa4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -565,14 +565,19 @@ void tick_nohz_idle_enter(void) */ void tick_nohz_irq_exit(void) { + unsigned long flags; struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); if (!ts->inidle) return; - /* Cancel the timer because CPU already waken up from the C-states*/ + local_irq_save(flags); + + /* Cancel the timer because CPU already waken up from the C-states */ menu_hrtimer_cancel(); __tick_nohz_idle_enter(ts); + + local_irq_restore(flags); } /** -- cgit v1.2.3 From 74eed0163d0def3fce27228d9ccf3d36e207b286 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Wed, 20 Feb 2013 22:00:48 +0100 Subject: irq: Ensure irq_exit() code runs with interrupts disabled We had already a few problems with code called from irq_exit() when interrupted from a nesting interrupt. This can happen on architectures which do not define __ARCH_IRQ_EXIT_IRQS_DISABLED. __ARCH_IRQ_EXIT_IRQS_DISABLED should go away and we want to make it mandatory to call irq_exit() with interrupts disabled. As a temporary protection disable interrupts for those architectures which do not define __ARCH_IRQ_EXIT_IRQS_DISABLED and add a WARN_ONCE when an architecture which defines __ARCH_IRQ_EXIT_IRQS_DISABLED calls irq_exit() with interrupts enabled. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linuxfoundation.org> Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1302202155320.22263@ionos --- kernel/softirq.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index f5cc25f147a6..f2a934673008 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -341,6 +341,14 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { +#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED + unsigned long flags; + + local_irq_save(flags); +#else + WARN_ON_ONCE(!irqs_disabled()); +#endif + account_irq_exit_time(current); trace_hardirq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); @@ -354,6 +362,9 @@ void irq_exit(void) #endif rcu_irq_exit(); sched_preempt_enable_no_resched(); +#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED + local_irq_restore(flags); +#endif } /* -- cgit v1.2.3 From facd8b80c67a3cf64a467c4a2ac5fb31f2e6745b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Thu, 21 Feb 2013 18:17:42 +0100 Subject: irq: Sanitize invoke_softirq With the irq protection in irq_exit, we can remove the #ifdeffery and the bh_disable/enable dance in invoke_softirq() Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linuxfoundation.org> Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1302202155320.22263@ionos --- kernel/softirq.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index f2a934673008..24a921bcf04f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -322,18 +322,10 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (!force_irqthreads) { -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED + if (!force_irqthreads) __do_softirq(); -#else - do_softirq(); -#endif - } else { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + else wakeup_softirqd(); - __local_bh_enable(SOFTIRQ_OFFSET); - } } /* -- cgit v1.2.3 From af7bdbafe3812af406ce07631effd2b96aae2dba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Thu, 21 Feb 2013 18:21:30 +0100 Subject: Revert "nohz: Make tick_nohz_irq_exit() irq safe" This reverts commit 351429b2e62b6545bb10c756686393f29ba268a1. The extra local_irq_save() is not longer needed as the call site now always calls with interrupts disabled. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@kernel.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Linus Torvalds <torvalds@linuxfoundation.org> --- kernel/time/tick-sched.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 520592ab6aa4..314b9ee07edf 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -565,19 +565,14 @@ void tick_nohz_idle_enter(void) */ void tick_nohz_irq_exit(void) { - unsigned long flags; struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); if (!ts->inidle) return; - local_irq_save(flags); - - /* Cancel the timer because CPU already waken up from the C-states */ + /* Cancel the timer because CPU already waken up from the C-states*/ menu_hrtimer_cancel(); __tick_nohz_idle_enter(ts); - - local_irq_restore(flags); } /** -- cgit v1.2.3 From 4d4c4e24cf48400a24d33feffc7cca4f4e8cabe1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Fri, 22 Feb 2013 00:05:07 +0100 Subject: irq: Remove IRQ_EXIT_OFFSET workaround The IRQ_EXIT_OFFSET trick was used to make sure the irq doesn't get preempted after we substract the HARDIRQ_OFFSET until we are entirely done with any code in irq_exit(). This workaround was necessary because some archs may call irq_exit() with irqs enabled and there is still some code in the end of this function that is not covered by the HARDIRQ_OFFSET but want to stay non-preemptible. Now that irq are always disabled in irq_exit(), the whole code is guaranteed not to be preempted. We can thus remove this hack. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- include/linux/hardirq.h | 2 -- kernel/softirq.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 29eb805ea4a6..c1d6555d2567 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -118,10 +118,8 @@ #ifdef CONFIG_PREEMPT_COUNT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) -# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else # define preemptible() 0 -# define IRQ_EXIT_OFFSET HARDIRQ_OFFSET #endif #if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS) diff --git a/kernel/softirq.c b/kernel/softirq.c index 24a921bcf04f..f42ff97e1f8f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -343,7 +343,7 @@ void irq_exit(void) account_irq_exit_time(current); trace_hardirq_exit(); - sub_preempt_count(IRQ_EXIT_OFFSET); + sub_preempt_count(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); @@ -353,7 +353,6 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - sched_preempt_enable_no_resched(); #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED local_irq_restore(flags); #endif -- cgit v1.2.3 From 46c498c2cdee5efe44f617bcd4f388179be36115 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Tue, 26 Feb 2013 18:44:33 +0100 Subject: stop_machine: Mark per cpu stopper enabled early commit 14e568e78 (stop_machine: Use smpboot threads) introduced the following regression: Before this commit the stopper enabled bit was set in the online notifier. CPU0 CPU1 cpu_up cpu online hotplug_notifier(ONLINE) stopper(CPU1)->enabled = true; ... stop_machine() The conversion to smpboot threads moved the enablement to the wakeup path of the parked thread. The majority of users seem to have the following working order: CPU0 CPU1 cpu_up cpu online unpark_threads() wakeup(stopper[CPU1]) .... stopper thread runs stopper(CPU1)->enabled = true; stop_machine() But Konrad and Sander have observed: CPU0 CPU1 cpu_up cpu online unpark_threads() wakeup(stopper[CPU1]) .... stop_machine() stopper thread runs stopper(CPU1)->enabled = true; Now the stop machinery kicks CPU0 into the stop loop, where it gets stuck forever because the queue code saw stopper(CPU1)->enabled == false, so CPU0 waits for CPU1 to enter stomp_machine, but the CPU1 stopper work got discarded due to enabled == false. Add a pre_unpark function to the smpboot thread descriptor and call it before waking the thread. This fixes the problem at hand, but the stop_machine code should be more robust. The stopper->enabled flag smells fishy at best. Thanks to Konrad for going through a loop of debug patches and providing the information to decode this issue. Reported-and-tested-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> Reported-and-tested-by: Sander Eikelenboom <linux@eikelenboom.it> Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Cc: Rusty Russell <rusty@rustcorp.com.au> Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1302261843240.22263@ionos Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/smpboot.h | 4 ++++ kernel/smpboot.c | 2 ++ kernel/stop_machine.c | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h index c65dee059913..13e929679550 100644 --- a/include/linux/smpboot.h +++ b/include/linux/smpboot.h @@ -24,6 +24,9 @@ struct smpboot_thread_data; * parked (cpu offline) * @unpark: Optional unpark function, called when the thread is * unparked (cpu online) + * @pre_unpark: Optional unpark function, called before the thread is + * unparked (cpu online). This is not guaranteed to be + * called on the target cpu of the thread. Careful! * @selfparking: Thread is not parked by the park function. * @thread_comm: The base name of the thread */ @@ -37,6 +40,7 @@ struct smp_hotplug_thread { void (*cleanup)(unsigned int cpu, bool online); void (*park)(unsigned int cpu); void (*unpark)(unsigned int cpu); + void (*pre_unpark)(unsigned int cpu); bool selfparking; const char *thread_comm; }; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d4abac261779..8eaed9aa9cf0 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -209,6 +209,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + if (ht->pre_unpark) + ht->pre_unpark(cpu); kthread_unpark(tsk); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 95d178c62d5a..c09f2955ae30 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = { .create = cpu_stop_create, .setup = cpu_stop_unpark, .park = cpu_stop_park, - .unpark = cpu_stop_unpark, + .pre_unpark = cpu_stop_unpark, .selfparking = true, }; -- cgit v1.2.3 From db05021d49a994ee40a9735d9c3cb0060c9babb8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Wed, 27 Feb 2013 21:48:09 -0500 Subject: ftrace: Update the kconfig for DYNAMIC_FTRACE The prompt to enable DYNAMIC_FTRACE (the ability to nop and enable function tracing at run time) had a confusing statement: "enable/disable ftrace tracepoints dynamically" This was written before tracepoints were added to the kernel, but now that tracepoints have been added, this is very confusing and has confused people enough to give wrong information during presentations. Not only that, I looked at the help text, and it still references that dreaded daemon that use to wake up once a second to update the nop locations and brick NICs, that hasn't been around for over five years. Time to bring the text up to the current decade. Cc: stable@vger.kernel.org Reported-by: Ezequiel Garcia <elezegarcia@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/Kconfig | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 36567564e221..b516a8e19d51 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -429,24 +429,28 @@ config PROBE_EVENTS def_bool n config DYNAMIC_FTRACE - bool "enable/disable ftrace tracepoints dynamically" + bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replace them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. + + See the files in /sys/kernel/debug/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. - config DYNAMIC_FTRACE_WITH_REGS def_bool y depends on DYNAMIC_FTRACE -- cgit v1.2.3 From 4cd5d1115c2f752ca94a0eb461b36d88fb37ed1e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Thu, 28 Feb 2013 20:00:43 +0100 Subject: irq: Don't re-enable interrupts at the end of irq_exit Commit 74eed0163d0def3fce27228d9ccf3d36e207b286 "irq: Ensure irq_exit() code runs with interrupts disabled" restore interrupts flags in the end of irq_exit() for archs that don't define __ARCH_IRQ_EXIT_IRQS_DISABLED. However always returning from irq_exit() with interrupts disabled should not be a problem for these archs. Prior to this commit this was already happening anytime we processed pending softirqs anyway. Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/softirq.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index f42ff97e1f8f..dce38fac4f32 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -334,9 +334,7 @@ static inline void invoke_softirq(void) void irq_exit(void) { #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED - unsigned long flags; - - local_irq_save(flags); + local_irq_disable(); #else WARN_ON_ONCE(!irqs_disabled()); #endif @@ -353,9 +351,6 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); -#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED - local_irq_restore(flags); -#endif } /* -- cgit v1.2.3 From f5faa0774e07eada85b0c55ec789b3f337d01412 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Tue, 19 Feb 2013 12:17:02 -0800 Subject: workqueue: use %current instead of worker->task in worker_maybe_bind_and_lock() worker_maybe_bind_and_lock() uses both @worker->task and @current at the same time. As worker_maybe_bind_and_lock() can only be called by the current worker task, they are always the same. Update worker_maybe_bind_and_lock() to use %current consistently. This doesn't introduce any functional change. tj: Massaged the description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811eb..f456433cf535 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1512,7 +1512,7 @@ static void worker_leave_idle(struct worker *worker) * flushed from cpu callbacks while cpu is going down, they are * guaranteed to execute on the cpu. * - * This function is to be used by rogue workers and rescuers to bind + * This function is to be used by unbound workers and rescuers to bind * themselves to the target cpu and may race with cpu going down or * coming online. kthread_bind() can't be used because it may put the * worker to already dead cpu and set_cpus_allowed_ptr() can't be used @@ -1537,7 +1537,6 @@ static bool worker_maybe_bind_and_lock(struct worker *worker) __acquires(&pool->lock) { struct worker_pool *pool = worker->pool; - struct task_struct *task = worker->task; while (true) { /* @@ -1547,12 +1546,12 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu)); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; - if (task_cpu(task) == pool->cpu && + if (task_cpu(current) == pool->cpu && cpumask_equal(¤t->cpus_allowed, get_cpu_mask(pool->cpu))) return true; -- cgit v1.2.3 From f36dc67b27a689eeb3631b11ebef17bbff257fbb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Tue, 19 Feb 2013 12:17:02 -0800 Subject: workqueue: change argument of worker_maybe_bind_and_lock() to @pool worker_maybe_bind_and_lock() currently takes @worker but only cares about @worker->pool. This patch updates worker_maybe_bind_and_lock() to take @pool instead of @worker. This will be used to better define synchronization rules regarding rescuer->pool updates. This doesn't introduce any functional change. tj: Updated the comments and description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f456433cf535..09545d445a55 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1504,8 +1504,10 @@ static void worker_leave_idle(struct worker *worker) } /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool - * @worker: self + * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it + * @pool: target worker_pool + * + * Bind %current to the cpu of @pool if it is associated and lock @pool. * * Works which are scheduled while the cpu is online must at least be * scheduled to a worker which is bound to the cpu so that if they are @@ -1533,11 +1535,9 @@ static void worker_leave_idle(struct worker *worker) * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ -static bool worker_maybe_bind_and_lock(struct worker *worker) +static bool worker_maybe_bind_and_lock(struct worker_pool *pool) __acquires(&pool->lock) { - struct worker_pool *pool = worker->pool; - while (true) { /* * The following call may fail, succeed or succeed @@ -1575,7 +1575,7 @@ __acquires(&pool->lock) static void idle_worker_rebind(struct worker *worker) { /* CPU may go down again inbetween, clear UNBOUND only on success */ - if (worker_maybe_bind_and_lock(worker)) + if (worker_maybe_bind_and_lock(worker->pool)) worker_clr_flags(worker, WORKER_UNBOUND); /* rebind complete, become available again */ @@ -1593,7 +1593,7 @@ static void busy_worker_rebind_fn(struct work_struct *work) { struct worker *worker = container_of(work, struct worker, rebind_work); - if (worker_maybe_bind_and_lock(worker)) + if (worker_maybe_bind_and_lock(worker->pool)) worker_clr_flags(worker, WORKER_UNBOUND); spin_unlock_irq(&worker->pool->lock); @@ -2038,7 +2038,7 @@ static bool manage_workers(struct worker *worker) * on @pool's current state. Try it and adjust * %WORKER_UNBOUND accordingly. */ - if (worker_maybe_bind_and_lock(worker)) + if (worker_maybe_bind_and_lock(pool)) worker->flags &= ~WORKER_UNBOUND; else worker->flags |= WORKER_UNBOUND; @@ -2358,7 +2358,7 @@ repeat: /* migrate to the target cpu if possible */ rescuer->pool = pool; - worker_maybe_bind_and_lock(rescuer); + worker_maybe_bind_and_lock(pool); /* * Slurp in all works issued via this workqueue and -- cgit v1.2.3 From b31041042a8cdece67f925e4bae55b5f5fd754ca Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Tue, 19 Feb 2013 12:17:02 -0800 Subject: workqueue: better define synchronization rule around rescuer->pool updates Rescuers visit different worker_pools to process work items from pools under pressure. Currently, rescuer->pool is updated outside any locking and when an outsider looks at a rescuer, there's no way to tell when and whether rescuer->pool is gonna change. While this doesn't currently cause any problem, it is nasty. With recent worker_maybe_bind_and_lock() changes, we can move rescuer->pool updates inside pool locks such that if rescuer->pool equals a locked pool, it's guaranteed to stay that way until the pool is unlocked. Move rescuer->pool inside pool->lock. This patch doesn't introduce any visible behavior difference. tj: Updated the description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 3 ++- kernel/workqueue_internal.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 09545d445a55..fd9a28a13afd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2357,8 +2357,8 @@ repeat: mayday_clear_cpu(cpu, wq->mayday_mask); /* migrate to the target cpu if possible */ - rescuer->pool = pool; worker_maybe_bind_and_lock(pool); + rescuer->pool = pool; /* * Slurp in all works issued via this workqueue and @@ -2379,6 +2379,7 @@ repeat: if (keep_working(pool)) wake_up_worker(pool); + rescuer->pool = NULL; spin_unlock_irq(&pool->lock); } diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 07650264ec15..f9c887731e2b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -32,6 +32,7 @@ struct worker { struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ + /* L: for rescuers */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ -- cgit v1.2.3 From 65dff759d2948cf18e2029fc5c0c595b8b7da3a5 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 1 Mar 2013 15:01:56 +0800 Subject: cgroup: fix cgroup_path() vs rename() race rename() will change dentry->d_name. The result of this race can be worse than seeing partially rewritten name, but we might access a stale pointer because rename() will re-allocate memory to hold a longer name. As accessing dentry->name must be protected by dentry->d_lock or parent inode's i_mutex, while on the other hand cgroup-path() can be called with some irq-safe spinlocks held, we can't generate cgroup path using dentry->d_name. Alternatively we make a copy of dentry->d_name and save it in cgrp->name when a cgroup is created, and update cgrp->name at rename(). v5: use flexible array instead of zero-size array. v4: - allocate root_cgroup_name and all root_cgroup->name points to it. - add cgroup_name() wrapper. v3: use kfree_rcu() instead of synchronize_rcu() in user-visible path. v2: make cgrp->name RCU safe. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- block/blk-cgroup.h | 2 - include/linux/cgroup.h | 24 +++++++++++ kernel/cgroup.c | 106 +++++++++++++++++++++++++++++++++++-------------- 3 files changed, 100 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index f2b292925ccd..4e595ee8c915 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { int ret; - rcu_read_lock(); ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - rcu_read_unlock(); if (ret) strncpy(buf, "<unavailable>", buflen); return ret; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f55..75c6ec1ba1ba 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -150,6 +150,11 @@ enum { CGRP_CPUSET_CLONE_CHILDREN, }; +struct cgroup_name { + struct rcu_head rcu_head; + char name[]; +}; + struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ @@ -172,6 +177,19 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ + /* + * This is a copy of dentry->d_name, and it's needed because + * we can't use dentry->d_name in cgroup_path(). + * + * You must acquire rcu_read_lock() to access cgrp->name, and + * the only place that can change it is rename(), which is + * protected by parent dir's i_mutex. + * + * Normally you should use cgroup_name() wrapper rather than + * access it directly. + */ + struct cgroup_name __rcu *name; + /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -404,6 +422,12 @@ struct cgroup_scanner { void *data; }; +/* Caller should hold rcu_read_lock() */ +static inline const char *cgroup_name(const struct cgroup *cgrp) +{ + return rcu_dereference(cgrp->name)->name; +} + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..50682168abc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) +static struct cgroup_name root_cgroup_name = { .name = "/" }; + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -859,6 +861,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } +static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +{ + struct cgroup_name *name; + + name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name->name, dentry->d_name.name); + return name; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, free_work); @@ -889,6 +902,7 @@ static void cgroup_free_fn(struct work_struct *work) simple_xattrs_free(&cgrp->xattrs); ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -1421,6 +1435,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; + cgrp->name = &root_cgroup_name; cgrp->top_cgroup = cgrp; init_cgroup_housekeeping(cgrp); list_add_tail(&cgrp->allcg_node, &root->allcg_list); @@ -1769,49 +1784,45 @@ static struct kobject *cgroup_kobj; * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference. Writes path of cgroup into buf. Returns 0 on success, - * -errno on error. + * Writes path of cgroup into buf. Returns 0 on success, -errno on error. + * + * We can't generate cgroup path using dentry->d_name, as accessing + * dentry->name must be protected by irq-unsafe dentry->d_lock or parent + * inode's i_mutex, while on the other hand cgroup_path() can be called + * with some irq-safe spinlocks held. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { - struct dentry *dentry = cgrp->dentry; + int ret = -ENAMETOOLONG; char *start; - rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), - "cgroup_path() called without proper locking"); - - if (cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); - return 0; - } - start = buf + buflen - 1; - *start = '\0'; - for (;;) { - int len = dentry->d_name.len; + rcu_read_lock(); + while (cgrp) { + const char *name = cgroup_name(cgrp); + int len; + + len = strlen(name); if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; + goto out; + memcpy(start, name, len); - dentry = cgrp->dentry; if (!cgrp->parent) - continue; + break; + if (--start < buf) - return -ENAMETOOLONG; + goto out; *start = '/'; + + cgrp = cgrp->parent; } + ret = 0; memmove(buf, start, buf + buflen - start); - return 0; +out: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(cgroup_path); @@ -2537,13 +2548,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + int ret; + struct cgroup_name *name, *old_name; + struct cgroup *cgrp; + + /* + * It's convinient to use parent dir's i_mutex to protected + * cgrp->name. + */ + lockdep_assert_held(&old_dir->i_mutex); + if (!S_ISDIR(old_dentry->d_inode->i_mode)) return -ENOTDIR; if (new_dentry->d_inode) return -EEXIST; if (old_dir != new_dir) return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + + cgrp = __d_cgrp(old_dentry); + + name = cgroup_alloc_name(new_dentry); + if (!name) + return -ENOMEM; + + ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + kfree(name); + return ret; + } + + old_name = cgrp->name; + rcu_assign_pointer(cgrp->name, name); + + kfree_rcu(old_name, rcu_head); + return 0; } static struct simple_xattrs *__d_xattrs(struct dentry *dentry) @@ -4158,6 +4196,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { struct cgroup *cgrp; + struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int err = 0; struct cgroup_subsys *ss; @@ -4168,9 +4207,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + name = cgroup_alloc_name(dentry); + if (!name) + goto err_free_cgrp; + rcu_assign_pointer(cgrp->name, name); + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); if (cgrp->id < 0) - goto err_free_cgrp; + goto err_free_name; /* * Only live parents can have children. Note that the liveliness @@ -4276,6 +4320,8 @@ err_free_all: deactivate_super(sb); err_free_id: ida_simple_remove(&root->cgroup_ida, cgrp->id); +err_free_name: + kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: kfree(cgrp); return err; -- cgit v1.2.3 From f440d98f8ebab02a768c1de17395e4239af9a97d Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 1 Mar 2013 15:02:15 +0800 Subject: cpuset: use cgroup_name() in cpuset_print_task_mems_allowed() Use cgroup_name() instead of cgrp->dentry->name. This makes the code a bit simpler. While at it, remove cpuset_name and make cpuset_nodelist a local variable to cpuset_print_task_mems_allowed(). Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cpuset.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..ace5bfcdcb30 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -264,17 +264,6 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_MUTEX(callback_mutex); -/* - * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist - * buffers. They are statically allocated to prevent using excess stack - * when calling cpuset_print_task_mems_allowed(). - */ -#define CPUSET_NAME_LEN (128) -#define CPUSET_NODELIST_LEN (256) -static char cpuset_name[CPUSET_NAME_LEN]; -static char cpuset_nodelist[CPUSET_NODELIST_LEN]; -static DEFINE_SPINLOCK(cpuset_buffer_lock); - /* * CPU / memory hotplug is handled asynchronously. */ @@ -2592,6 +2581,8 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); } +#define CPUSET_NODELIST_LEN (256) + /** * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed * @task: pointer to task_struct of some task. @@ -2602,24 +2593,19 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, */ void cpuset_print_task_mems_allowed(struct task_struct *tsk) { - struct dentry *dentry; + /* Statically allocated to prevent using excess stack. */ + static char cpuset_nodelist[CPUSET_NODELIST_LEN]; + static DEFINE_SPINLOCK(cpuset_buffer_lock); - dentry = task_cs(tsk)->css.cgroup->dentry; - spin_lock(&cpuset_buffer_lock); + struct cgroup *cgrp = task_cs(tsk)->css.cgroup; - if (!dentry) { - strcpy(cpuset_name, "/"); - } else { - spin_lock(&dentry->d_lock); - strlcpy(cpuset_name, (const char *)dentry->d_name.name, - CPUSET_NAME_LEN); - spin_unlock(&dentry->d_lock); - } + spin_lock(&cpuset_buffer_lock); nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, tsk->mems_allowed); printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", - tsk->comm, cpuset_name, cpuset_nodelist); + tsk->comm, cgroup_name(cgrp), cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); } -- cgit v1.2.3 From f50daa704f36a6544a902c52b6cf37b0493dfc5d Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 1 Mar 2013 15:06:07 +0800 Subject: cgroup: no need to check css refs for release notification We no longer fail rmdir() when there're still css refs, so we don't need to check css refs in check_for_release(). This also voids a bug. cgroup_has_css_refs() accesses subsys[i] without cgroup_mutex, so it can race with cgroup_unload_subsys(). cgroup_has_css_refs() ... if (ss == NULL || ss->root != cgrp->root) if ss pointers to net_cls_subsys, and cls_cgroup module is unloaded right after the former check but before the latter, the memory that net_cls_subsys resides has become invalid. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 67 +++++++-------------------------------------------------- 1 file changed, 8 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 50682168abc2..9df799d5d31c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4341,47 +4341,6 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -/* - * Check the reference count on each subsystem. Since we already - * established that there are no tasks in the cgroup, if the css refcount - * is also 1, then there should be no outstanding references, so the - * subsystem is safe to destroy. We scan across all subsystems rather than - * using the per-hierarchy linked list of mounted subsystems since we can - * be called via check_for_release() with no synchronization other than - * RCU, and the subsystem linked list isn't RCU-safe. - */ -static int cgroup_has_css_refs(struct cgroup *cgrp) -{ - int i; - - /* - * We won't need to lock the subsys array, because the subsystems - * we're concerned about aren't going anywhere since our cgroup root - * has a reference on them. - */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - struct cgroup_subsys_state *css; - - /* Skip subsystems not present or not in this hierarchy */ - if (ss == NULL || ss->root != cgrp->root) - continue; - - css = cgrp->subsys[ss->subsys_id]; - /* - * When called from check_for_release() it's possible - * that by this point the cgroup has been removed - * and the css deleted. But a false-positive doesn't - * matter, since it can only happen if the cgroup - * has been deleted and hence no longer needs the - * release agent to be called anyway. - */ - if (css && css_refcnt(css) > 1) - return 1; - } - return 0; -} - static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { @@ -5108,12 +5067,15 @@ static void check_for_release(struct cgroup *cgrp) { /* All of these checks rely on RCU to keep the cgroup * structure alive */ - if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) - && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { - /* Control Group is currently removeable. If it's not + if (cgroup_is_releasable(cgrp) && + !atomic_read(&cgrp->count) && list_empty(&cgrp->children)) { + /* + * Control Group is currently removeable. If it's not * already queued for a userspace notification, queue - * it now */ + * it now + */ int need_schedule_work = 0; + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { @@ -5146,24 +5108,11 @@ EXPORT_SYMBOL_GPL(__css_tryget); /* Caller must verify that the css is not for root cgroup */ void __css_put(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = css->cgroup; int v; - rcu_read_lock(); v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); - - switch (v) { - case 1: - if (notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } - break; - case 0: + if (v == 0) schedule_work(&css->dput_work); - break; - } - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(__css_put); -- cgit v1.2.3 From 7d8e0bf56a66bab08d2f316dd87e56c08cecb899 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 5 Mar 2013 10:57:03 +0800 Subject: cgroup: avoid accessing modular cgroup subsys structure without locking subsys[i] is set to NULL in cgroup_unload_subsys() at modular unload, and that's protected by cgroup_mutex, and then the memory *subsys[i] resides will be freed. So this is unsafe without any locking: if (!ss || ss->module) ... v2: - add a comment for enum cgroup_subsys_id - simplify the comment in cgroup_exit() Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/cgroup.h | 17 ++++++++++++++--- kernel/cgroup.c | 28 ++++++++++++++-------------- 2 files changed, 28 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 75c6ec1ba1ba..5f76829dd75e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -44,14 +44,25 @@ extern void cgroup_unload_subsys(struct cgroup_subsys *ss); extern const struct file_operations proc_cgroup_operations; -/* Define the enumeration of all builtin cgroup subsystems */ +/* + * Define the enumeration of all cgroup subsystems. + * + * We define ids for builtin subsystems and then modular ones. + */ #define SUBSYS(_x) _x ## _subsys_id, -#define IS_SUBSYS_ENABLED(option) IS_ENABLED(option) enum cgroup_subsys_id { +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) +#include <linux/cgroup_subsys.h> +#undef IS_SUBSYS_ENABLED + CGROUP_BUILTIN_SUBSYS_COUNT, + + __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1, + +#define IS_SUBSYS_ENABLED(option) IS_MODULE(option) #include <linux/cgroup_subsys.h> +#undef IS_SUBSYS_ENABLED CGROUP_SUBSYS_COUNT, }; -#undef IS_SUBSYS_ENABLED #undef SUBSYS /* Per-subsystem/per-cgroup state maintained by the system. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9df799d5d31c..7a6c4c72ca55 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4940,17 +4940,17 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, and the builtin section of the subsys + * array is immutable, so we don't need to lock the + * subsys array here. On the other hand, modular section + * of the array can be freed at module unload, so we + * can't touch that. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* - * fork/exit callbacks are supported only for - * builtin subsystems and we don't need further - * synchronization as they never go away. - */ - if (!ss || ss->module) - continue; - if (ss->fork) ss->fork(child); } @@ -5015,13 +5015,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, see cgroup_post_fork() for details. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* modular subsystems can't use callbacks */ - if (!ss || ss->module) - continue; - if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; -- cgit v1.2.3 From 877c685607925238e302cd3aa38788dca6c1b226 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 5 Mar 2013 11:38:08 +0800 Subject: perf: Remove include of cgroup.h from perf_event.h Move struct perf_cgroup_info and perf_cgroup to kernel/perf/core.c, and then we can remove include of cgroup.h. Signed-off-by: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Stephane Eranian <eranian@google.com> Cc: Tejun Heo <tj@kernel.org> Link: http://lkml.kernel.org/r/513568A0.6020804@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- include/linux/perf_event.h | 18 +----------------- kernel/events/core.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e47ee462c2f2..8737e1cee8b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -21,7 +21,6 @@ */ #ifdef CONFIG_PERF_EVENTS -# include <linux/cgroup.h> # include <asm/perf_event.h> # include <asm/local64.h> #endif @@ -299,22 +298,7 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 -#ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info *info; /* timing info, one per cpu */ -}; -#endif - +struct perf_cgroup; struct ring_buffer; /** diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c30..5976a2a6b4ce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -37,6 +37,7 @@ #include <linux/ftrace_event.h> #include <linux/hw_breakpoint.h> #include <linux/mm_types.h> +#include <linux/cgroup.h> #include "internal.h" @@ -233,6 +234,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info *info; +}; + /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function -- cgit v1.2.3 From d8741e2e88ac9a458765a0c7b4e6542d7c038334 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 10:25:16 -0500 Subject: tracing: Add help of snapshot feature when snapshot is empty When cat'ing the snapshot file, instead of showing an empty trace header like the trace file does, show how to use the snapshot feature. Also, this is a good place to show if the snapshot has been allocated or not. Users may want to "pre allocate" the snapshot to have a fast "swap" of the current buffer. Otherwise, a swap would be slow and might fail as it would need to allocate the snapshot buffer, and that might fail under tight memory constraints. Here's what it looked like before: # tracer: nop # # entries-in-buffer/entries-written: 0/0 #P:4 # # _-----=> irqs-off # / _----=> need-resched # | / _---=> hardirq/softirq # || / _--=> preempt-depth # ||| / delay # TASK-PID CPU# |||| TIMESTAMP FUNCTION # | | | |||| | | Here's what it looks like now: # tracer: nop # # # * Snapshot is freed * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') Acked-by: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c2e2c2310374..9e3120b8a2ad 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2400,6 +2400,27 @@ static void test_ftrace_alive(struct seq_file *m) seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); } +#ifdef CONFIG_TRACER_MAX_TRACE +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->trace->allocated_snapshot) + seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_printf(m, "# Snapshot commands:\n"); + seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer.\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2411,7 +2432,9 @@ static int s_show(struct seq_file *m, void *v) seq_puts(m, "#\n"); test_ftrace_alive(m); } - if (iter->trace && iter->trace->print_header) + if (iter->snapshot && trace_empty(iter)) + print_snapshot_help(m, iter); + else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); -- cgit v1.2.3 From c9960e48543799f168c4c9486f9790fb686ce5a8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 10:53:02 -0500 Subject: tracing: Do not return EINVAL in snapshot when not allocated To use the tracing snapshot feature, writing a '1' into the snapshot file causes the snapshot buffer to be allocated if it has not already been allocated and dose a 'swap' with the main buffer, so that the snapshot now contains what was in the main buffer, and the main buffer now writes to what was the snapshot buffer. To free the snapshot buffer, a '0' is written into the snapshot file. To clear the snapshot buffer, any number but a '0' or '1' is written into the snapshot file. But if the file is not allocated it returns -EINVAL error code. This is rather pointless. It is better just to do nothing and return success. Acked-by: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9e3120b8a2ad..1f835a83cb2c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4167,8 +4167,6 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, default: if (current_trace->allocated_snapshot) tracing_reset_online_cpus(&max_tr); - else - ret = -EINVAL; break; } -- cgit v1.2.3 From a7dc19b8652c862d5b7c4d2339bd3c428bd29c4a Mon Sep 17 00:00:00 2001 From: Mark Rutland <mark.rutland@arm.com> Date: Thu, 7 Mar 2013 15:09:24 +0000 Subject: clockevents: Don't allow dummy broadcast timers Currently tick_check_broadcast_device doesn't reject clock_event_devices with CLOCK_EVT_FEAT_DUMMY, and may select them in preference to real hardware if they have a higher rating value. In this situation, the dummy timer is responsible for broadcasting to itself, and the core clockevents code may attempt to call non-existent callbacks for programming the dummy, eventually leading to a panic. This patch makes tick_check_broadcast_device always reject dummy timers, preventing this problem. Signed-off-by: Mark Rutland <mark.rutland@arm.com> Cc: linux-arm-kernel@lists.infradead.org Cc: Jon Medhurst (Tixy) <tixy@linaro.org> Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- kernel/time/tick-broadcast.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8d..7f32fe0e52cd 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if ((tick_broadcast_device.evtdev && + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || + (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; -- cgit v1.2.3 From dc893e19b5800d7743fb58235877bfa9091805ff Mon Sep 17 00:00:00 2001 From: Arnd Bergmann <arnd@arndb.de> Date: Fri, 8 Mar 2013 12:43:31 -0800 Subject: Revert parts of "hlist: drop the node parameter from iterators" Commit b67bfe0d42ca ("hlist: drop the node parameter from iterators") did a lot of nice changes but also contains two small hunks that seem to have slipped in accidentally and have no apparent connection to the intent of the patch. This reverts the two extraneous changes. Signed-off-by: Arnd Bergmann <arnd@arndb.de> Cc: Peter Senna Tschudin <peter.senna@gmail.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Sasha Levin <sasha.levin@oracle.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/smpboot.c | 2 +- net/9p/trans_virtio.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 25d3d8b6e4e1..8eaed9aa9cf0 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data) continue; } - //BUG_ON(td->cpu != smp_processor_id()); + BUG_ON(td->cpu != smp_processor_id()); /* Check for state change setup */ switch (td->status) { diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 74dea377fe5b..de2e950a0a7a 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -655,7 +655,7 @@ static struct p9_trans_module p9_virtio_trans = { .create = p9_virtio_create, .close = p9_virtio_close, .request = p9_virtio_request, - //.zc_request = p9_virtio_zc_request, + .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, /* * We leave one entry for input and one entry for response -- cgit v1.2.3 From eb2834285cf172856cd12f66892fc7467935ebed Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Fri, 8 Mar 2013 15:18:28 -0800 Subject: workqueue: fix possible pool stall bug in wq_unbind_fn() Since multiple pools per cpu have been introduced, wq_unbind_fn() has a subtle bug which may theoretically stall work item processing. The problem is two-fold. * wq_unbind_fn() depends on the worker executing wq_unbind_fn() itself to start unbound chain execution, which works fine when there was only single pool. With multiple pools, only the pool which is running wq_unbind_fn() - the highpri one - is guaranteed to have such kick-off. The other pool could stall when its busy workers block. * The current code is setting WORKER_UNBIND / POOL_DISASSOCIATED of the two pools in succession without initiating work execution inbetween. Because setting the flags requires grabbing assoc_mutex which is held while new workers are created, this could lead to stalls if a pool's manager is waiting for the previous pool's work items to release memory. This is almost purely theoretical tho. Update wq_unbind_fn() such that it sets WORKER_UNBIND / POOL_DISASSOCIATED, goes over schedule() and explicitly kicks off execution for a pool and then moves on to the next one. tj: Updated comments and description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> Cc: stable@vger.kernel.org --- kernel/workqueue.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811eb..604801b91cba 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3446,28 +3446,34 @@ static void wq_unbind_fn(struct work_struct *work) spin_unlock_irq(&pool->lock); mutex_unlock(&pool->assoc_mutex); - } - /* - * Call schedule() so that we cross rq->lock and thus can guarantee - * sched callbacks see the %WORKER_UNBOUND flag. This is necessary - * as scheduler callbacks may be invoked from other cpus. - */ - schedule(); + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the %WORKER_UNBOUND flag. + * This is necessary as scheduler callbacks may be invoked + * from other cpus. + */ + schedule(); - /* - * Sched callbacks are disabled now. Zap nr_running. After this, - * nr_running stays zero and need_more_worker() and keep_working() - * are always true as long as the worklist is not empty. Pools on - * @cpu now behave as unbound (in terms of concurrency management) - * pools which are served by workers tied to the CPU. - * - * On return from this function, the current worker would trigger - * unbound chain execution of pending work items if other workers - * didn't already. - */ - for_each_std_worker_pool(pool, cpu) + /* + * Sched callbacks are disabled now. Zap nr_running. + * After this, nr_running stays zero and need_more_worker() + * and keep_working() are always true as long as the + * worklist is not empty. This pool now behaves as an + * unbound (in terms of concurrency management) pool which + * are served by workers tied to the pool. + */ atomic_set(&pool->nr_running, 0); + + /* + * With concurrency management just turned off, a busy + * worker blocking could lead to lengthy stalls. Kick off + * unbound chain execution of currently pending work items. + */ + spin_lock_irq(&pool->lock); + wake_up_worker(pool); + spin_unlock_irq(&pool->lock); + } } /* -- cgit v1.2.3 From 2721e72dd10f71a3ba90f59781becf02638aa0d9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 11:32:32 -0400 Subject: tracing: Fix race in snapshot swapping Although the swap is wrapped with a spin_lock, the assignment of the temp buffer used to swap is not within that lock. It needs to be moved into that lock, otherwise two swaps happening on two different CPUs, can end up using the wrong temp buffer to assign in the swap. Luckily, all current callers of the swap function appear to have their own locks. But in case something is added that allows two different callers to call the swap, then there's a chance that this race can trigger and corrupt the buffers. New code is coming soon that will allow for this race to trigger. I've Cc'd stable, so this bug will not show up if someone backports one of the changes that can trigger this bug. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1f835a83cb2c..53df2839bb93 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -704,7 +704,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf = tr->buffer; + struct ring_buffer *buf; if (trace_stop_count) return; @@ -719,6 +719,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); + buf = tr->buffer; tr->buffer = max_tr.buffer; max_tr.buffer = buf; -- cgit v1.2.3 From 34ed62461ae4970695974afb9a60ac3df0086830 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Mon, 7 Jan 2013 13:37:42 -0800 Subject: rcu: Remove restrictions on no-CBs CPUs Currently, CPU 0 is constrained to not be a no-CBs CPU, and furthermore at least one no-CBs CPU must remain online at any given time. These restrictions are problematic in some situations, such as cases where all CPUs must run a real-time workload that needs to be insulated from OS jitter and latencies due to RCU callback invocation. This commit therefore provides no-CBs CPUs a (very crude and energy-inefficient) way to start and to wait for grace periods independently of the normal RCU callback mechanisms. This approach allows any or all of the CPUs to be designated as no-CBs CPUs, and allows any proper subset of the CPUs (whether no-CBs CPUs or not) to be offlined. This commit also provides a fix for a locking bug spotted by Xie ChanglongX <changlongx.xie@intel.com>. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- init/Kconfig | 4 +- kernel/rcutree.c | 14 ++--- kernel/rcutree.h | 12 +--- kernel/rcutree_plugin.h | 156 ++++++++++++++---------------------------------- 4 files changed, 57 insertions(+), 129 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 22616cd434bc..c8bd349eb638 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -655,7 +655,7 @@ config RCU_BOOST_DELAY Accept the default if unsure. config RCU_NOCB_CPU - bool "Offload RCU callback processing from boot-selected CPUs" + bool "Offload RCU callback processing from boot-selected CPUs (EXPERIMENTAL" depends on TREE_RCU || TREE_PREEMPT_RCU default n help @@ -673,7 +673,7 @@ config RCU_NOCB_CPU callback, and (2) affinity or cgroups can be used to force the kthreads to run on whatever set of CPUs is desired. - Say Y here if you want reduced OS jitter on selected CPUs. + Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. endmenu # "RCU Subsystem" diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b8ad827fd86..6ad0716e65dc 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -310,6 +310,8 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ + if (rcu_nocb_needs_gp(rdp)) + return 1; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) @@ -1035,10 +1037,11 @@ static void init_callback_list(struct rcu_data *rdp) { int i; + if (init_nocb_callback_list(rdp)) + return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - init_nocb_callback_list(rdp); } /* @@ -2909,7 +2912,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - int ret = NOTIFY_OK; trace_rcu_utilization("Start CPU hotplug"); switch (action) { @@ -2923,10 +2925,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, rcu_boost_kthread_setaffinity(rnp, -1); break; case CPU_DOWN_PREPARE: - if (nocb_cpu_expendable(cpu)) - rcu_boost_kthread_setaffinity(rnp, cpu); - else - ret = NOTIFY_BAD; + rcu_boost_kthread_setaffinity(rnp, cpu); break; case CPU_DYING: case CPU_DYING_FROZEN: @@ -2950,7 +2949,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; } trace_rcu_utilization("End CPU hotplug"); - return ret; + return NOTIFY_OK; } /* @@ -3170,7 +3169,6 @@ void __init rcu_init(void) rcu_init_one(&rcu_sched_state, &rcu_sched_data); rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); - rcu_init_nocb(); open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b5045d9d..7af39f4aaac4 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -326,6 +326,7 @@ struct rcu_data { int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; + bool nocb_needs_gp; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int cpu; @@ -375,12 +376,6 @@ struct rcu_state { struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ void (*func)(struct rcu_head *head)); -#ifdef CONFIG_RCU_NOCB_CPU - void (*call_remote)(struct rcu_head *head, - void (*func)(struct rcu_head *head)); - /* call_rcu() flavor, but for */ - /* placing on remote CPU. */ -#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* The following fields are guarded by the root rcu_node's lock. */ @@ -529,16 +524,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); +static int rcu_nocb_needs_gp(struct rcu_data *rdp); static bool is_nocb_cpu(int cpu); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy); static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp); -static bool nocb_cpu_expendable(int cpu); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); -static void init_nocb_callback_list(struct rcu_data *rdp); -static void __init rcu_init_nocb(void); +static bool init_nocb_callback_list(struct rcu_data *rdp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9d..44f958a88b21 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -86,10 +86,6 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU if (have_rcu_nocb_mask) { - if (cpumask_test_cpu(0, rcu_nocb_mask)) { - cpumask_clear_cpu(0, rcu_nocb_mask); - pr_info("\tCPU 0: illegal no-CBs CPU (cleared).\n"); - } cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); if (rcu_nocb_poll) @@ -2165,6 +2161,14 @@ static int __init parse_rcu_nocb_poll(char *arg) } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); +/* + * Does this CPU needs a grace period due to offloaded callbacks? + */ +static int rcu_nocb_needs_gp(struct rcu_data *rdp) +{ + return rdp->nocb_needs_gp; +} + /* Is the specified CPU a no-CPUs CPU? */ static bool is_nocb_cpu(int cpu) { @@ -2265,95 +2269,39 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, } /* - * There must be at least one non-no-CBs CPU in operation at any given - * time, because no-CBs CPUs are not capable of initiating grace periods - * independently. This function therefore complains if the specified - * CPU is the last non-no-CBs CPU, allowing the CPU-hotplug system to - * avoid offlining the last such CPU. (Recursion is a wonderful thing, - * but you have to have a base case!) + * If necessary, kick off a new grace period, and either way wait + * for a subsequent grace period to complete. */ -static bool nocb_cpu_expendable(int cpu) +static void rcu_nocb_wait_gp(struct rcu_data *rdp) { - cpumask_var_t non_nocb_cpus; - int ret; + unsigned long c; + unsigned long flags; + unsigned long j; + struct rcu_node *rnp = rdp->mynode; + + raw_spin_lock_irqsave(&rnp->lock, flags); + c = rnp->completed + 2; + rdp->nocb_needs_gp = true; + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* - * If there are no no-CB CPUs or if this CPU is not a no-CB CPU, - * then offlining this CPU is harmless. Let it happen. + * Wait for the grace period. Do so interruptibly to avoid messing + * up the load average. */ - if (!have_rcu_nocb_mask || is_nocb_cpu(cpu)) - return 1; - - /* If no memory, play it safe and keep the CPU around. */ - if (!alloc_cpumask_var(&non_nocb_cpus, GFP_NOIO)) - return 0; - cpumask_andnot(non_nocb_cpus, cpu_online_mask, rcu_nocb_mask); - cpumask_clear_cpu(cpu, non_nocb_cpus); - ret = !cpumask_empty(non_nocb_cpus); - free_cpumask_var(non_nocb_cpus); - return ret; -} - -/* - * Helper structure for remote registry of RCU callbacks. - * This is needed for when a no-CBs CPU needs to start a grace period. - * If it just invokes call_rcu(), the resulting callback will be queued, - * which can result in deadlock. - */ -struct rcu_head_remote { - struct rcu_head *rhp; - call_rcu_func_t *crf; - void (*func)(struct rcu_head *rhp); -}; - -/* - * Register a callback as specified by the rcu_head_remote struct. - * This function is intended to be invoked via smp_call_function_single(). - */ -static void call_rcu_local(void *arg) -{ - struct rcu_head_remote *rhrp = - container_of(arg, struct rcu_head_remote, rhp); - - rhrp->crf(rhrp->rhp, rhrp->func); -} - -/* - * Set up an rcu_head_remote structure and the invoke call_rcu_local() - * on CPU 0 (which is guaranteed to be a non-no-CBs CPU) via - * smp_call_function_single(). - */ -static void invoke_crf_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp), - call_rcu_func_t crf) -{ - struct rcu_head_remote rhr; - - rhr.rhp = rhp; - rhr.crf = crf; - rhr.func = func; - smp_call_function_single(0, call_rcu_local, &rhr, 1); -} - -/* - * Helper functions to be passed to wait_rcu_gp(), each of which - * invokes invoke_crf_remote() to register a callback appropriately. - */ -static void __maybe_unused -call_rcu_preempt_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu); -} -static void call_rcu_bh_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_bh); -} -static void call_rcu_sched_remote(struct rcu_head *rhp, - void (*func)(struct rcu_head *rhp)) -{ - invoke_crf_remote(rhp, func, call_rcu_sched); + for (;;) { + j = jiffies; + schedule_timeout_interruptible(2); + raw_spin_lock_irqsave(&rnp->lock, flags); + if (ULONG_CMP_GE(rnp->completed, c)) { + rdp->nocb_needs_gp = false; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; + } + if (j == jiffies) + flush_signals(current); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + smp_mb(); /* Ensure that CB invocation happens after GP end. */ } /* @@ -2390,7 +2338,7 @@ static int rcu_nocb_kthread(void *arg) cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); ACCESS_ONCE(rdp->nocb_p_count) += c; ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; - wait_rcu_gp(rdp->rsp->call_remote); + rcu_nocb_wait_gp(rdp); /* Each pass through the following loop invokes a callback. */ trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); @@ -2443,26 +2391,22 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) } /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ -static void init_nocb_callback_list(struct rcu_data *rdp) +static bool init_nocb_callback_list(struct rcu_data *rdp) { if (rcu_nocb_mask == NULL || !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) - return; + return false; rdp->nxttail[RCU_NEXT_TAIL] = NULL; + return true; } -/* Initialize the ->call_remote fields in the rcu_state structures. */ -static void __init rcu_init_nocb(void) +#else /* #ifdef CONFIG_RCU_NOCB_CPU */ + +static int rcu_nocb_needs_gp(struct rcu_data *rdp) { -#ifdef CONFIG_PREEMPT_RCU - rcu_preempt_state.call_remote = call_rcu_preempt_remote; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ - rcu_bh_state.call_remote = call_rcu_bh_remote; - rcu_sched_state.call_remote = call_rcu_sched_remote; + return 0; } -#else /* #ifdef CONFIG_RCU_NOCB_CPU */ - static bool is_nocb_cpu(int cpu) { return false; @@ -2480,11 +2424,6 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, return 0; } -static bool nocb_cpu_expendable(int cpu) -{ - return 1; -} - static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { } @@ -2493,12 +2432,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) { } -static void init_nocb_callback_list(struct rcu_data *rdp) -{ -} - -static void __init rcu_init_nocb(void) +static bool init_nocb_callback_list(struct rcu_data *rdp) { + return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ -- cgit v1.2.3 From 6183c009f6cd94b42e5812adcfd4ba6220a196e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: workqueue: make sanity checks less punshing using WARN_ON[_ONCE]()s Workqueue has been using mostly BUG_ON()s for sanity checks, which fail unnecessarily harshly when the assertion doesn't hold. Most assertions can converted to be less drastic such that things can limp along instead of dying completely. Convert BUG_ON()s to WARN_ON[_ONCE]()s with softer failure behaviors - e.g. if assertion check fails in destroy_worker(), trigger WARN and silently ignore destruction request. Most conversions are trivial. Note that sanity checks in destroy_workqueue() are moved above removal from workqueues list so that it can bail out without side-effects if assertion checks fail. This patch doesn't introduce any visible behavior changes during normal operation. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 85 +++++++++++++++++++++++++++++------------------------- 1 file changed, 46 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index fd9a28a13afd..c6e1bdb469ee 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -530,7 +530,7 @@ static int work_next_color(int color) static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) { - BUG_ON(!work_pending(work)); + WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | flags | work_static(work)); } @@ -785,7 +785,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, pool = worker->pool; /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); + if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + return NULL; /* * The counterpart of the following dec_and_test, implied mb, @@ -1458,9 +1459,10 @@ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(worker->flags & WORKER_IDLE); - BUG_ON(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev)); + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; @@ -1497,7 +1499,8 @@ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(!(worker->flags & WORKER_IDLE)); + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); @@ -1793,8 +1796,9 @@ static void destroy_worker(struct worker *worker) int id = worker->id; /* sanity check frenzy */ - BUG_ON(worker->current_work); - BUG_ON(!list_empty(&worker->scheduled)); + if (WARN_ON(worker->current_work) || + WARN_ON(!list_empty(&worker->scheduled))) + return; if (worker->flags & WORKER_STARTED) pool->nr_workers--; @@ -1923,7 +1927,8 @@ restart: del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); start_worker(worker); - BUG_ON(need_to_create_worker(pool)); + if (WARN_ON_ONCE(need_to_create_worker(pool))) + goto restart; return true; } @@ -2256,7 +2261,7 @@ recheck: * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ - BUG_ON(!list_empty(&worker->scheduled)); + WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* * When control reaches this point, we're guaranteed to have @@ -2364,7 +2369,7 @@ repeat: * Slurp in all works issued via this workqueue and * process'em. */ - BUG_ON(!list_empty(&rescuer->scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); @@ -2499,7 +2504,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, unsigned int cpu; if (flush_color >= 0) { - BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); + WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } @@ -2510,7 +2515,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, spin_lock_irq(&pool->lock); if (flush_color >= 0) { - BUG_ON(pwq->flush_color != -1); + WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; @@ -2520,7 +2525,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } if (work_color >= 0) { - BUG_ON(work_color != work_next_color(pwq->work_color)); + WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } @@ -2568,13 +2573,13 @@ void flush_workqueue(struct workqueue_struct *wq) * becomes our flush_color and work_color is advanced * by one. */ - BUG_ON(!list_empty(&wq->flusher_overflow)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; @@ -2587,7 +2592,7 @@ void flush_workqueue(struct workqueue_struct *wq) } } else { /* wait in queue */ - BUG_ON(wq->flush_color == this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } @@ -2621,8 +2626,8 @@ void flush_workqueue(struct workqueue_struct *wq) wq->first_flusher = NULL; - BUG_ON(!list_empty(&this_flusher.list)); - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(!list_empty(&this_flusher.list)); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; @@ -2635,8 +2640,8 @@ void flush_workqueue(struct workqueue_struct *wq) complete(&next->done); } - BUG_ON(!list_empty(&wq->flusher_overflow) && - wq->flush_color != work_next_color(wq->work_color)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); @@ -2660,7 +2665,7 @@ void flush_workqueue(struct workqueue_struct *wq) } if (list_empty(&wq->flusher_queue)) { - BUG_ON(wq->flush_color != wq->work_color); + WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } @@ -2668,8 +2673,8 @@ void flush_workqueue(struct workqueue_struct *wq) * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ - BUG_ON(wq->flush_color == wq->work_color); - BUG_ON(wq->flush_color != next->flush_color); + WARN_ON_ONCE(wq->flush_color == wq->work_color); + WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; @@ -3263,6 +3268,19 @@ void destroy_workqueue(struct workqueue_struct *wq) /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* sanity checks */ + for_each_pwq_cpu(cpu, wq) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + if (WARN_ON(pwq->nr_in_flight[i])) + return; + if (WARN_ON(pwq->nr_active) || + WARN_ON(!list_empty(&pwq->delayed_works))) + return; + } + /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. @@ -3271,17 +3289,6 @@ void destroy_workqueue(struct workqueue_struct *wq) list_del(&wq->list); spin_unlock(&workqueue_lock); - /* sanity check */ - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - int i; - - for (i = 0; i < WORK_NR_COLORS; i++) - BUG_ON(pwq->nr_in_flight[i]); - BUG_ON(pwq->nr_active); - BUG_ON(!list_empty(&pwq->delayed_works)); - } - if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); free_mayday_mask(wq->mayday_mask); @@ -3424,7 +3431,7 @@ static void wq_unbind_fn(struct work_struct *work) int i; for_each_std_worker_pool(pool, cpu) { - BUG_ON(cpu != smp_processor_id()); + WARN_ON_ONCE(cpu != smp_processor_id()); mutex_lock(&pool->assoc_mutex); spin_lock_irq(&pool->lock); @@ -3594,7 +3601,7 @@ void freeze_workqueues_begin(void) spin_lock(&workqueue_lock); - BUG_ON(workqueue_freezing); + WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; for_each_wq_cpu(cpu) { @@ -3642,7 +3649,7 @@ bool freeze_workqueues_busy(void) spin_lock(&workqueue_lock); - BUG_ON(!workqueue_freezing); + WARN_ON_ONCE(!workqueue_freezing); for_each_wq_cpu(cpu) { struct workqueue_struct *wq; @@ -3656,7 +3663,7 @@ bool freeze_workqueues_busy(void) if (!pwq || !(wq->flags & WQ_FREEZABLE)) continue; - BUG_ON(pwq->nr_active < 0); + WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; goto out_unlock; -- cgit v1.2.3 From e98d5b16cf4df992c40a7c83f1eae61db5bb03da Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: workqueue: make workqueue_lock irq-safe workqueue_lock will be used to synchronize areas which require irq-safety and there isn't much benefit in keeping it not irq-safe. Make it irq-safe. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c6e1bdb469ee..c585d0ebd353 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2715,10 +2715,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (!wq->nr_drainers++) wq->flags |= WQ_DRAINING; - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); reflush: flush_workqueue(wq); @@ -2740,10 +2740,10 @@ reflush: goto reflush; } - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (!--wq->nr_drainers) wq->flags &= ~WQ_DRAINING; - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3233,7 +3233,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, * list. Grab it, set max_active accordingly and add the new * workqueue to workqueues list. */ - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (workqueue_freezing && wq->flags & WQ_FREEZABLE) for_each_pwq_cpu(cpu, wq) @@ -3241,7 +3241,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); return wq; err: @@ -3285,9 +3285,9 @@ void destroy_workqueue(struct workqueue_struct *wq) * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); list_del(&wq->list); - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); @@ -3336,7 +3336,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); wq->saved_max_active = max_active; @@ -3344,16 +3344,16 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) struct pool_workqueue *pwq = get_pwq(cpu, wq); struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); if (!(wq->flags & WQ_FREEZABLE) || !(pool->flags & POOL_FREEZING)) pwq_set_max_active(pwq, max_active); - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); @@ -3599,7 +3599,7 @@ void freeze_workqueues_begin(void) { unsigned int cpu; - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; @@ -3609,7 +3609,7 @@ void freeze_workqueues_begin(void) struct workqueue_struct *wq; for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; @@ -3622,11 +3622,11 @@ void freeze_workqueues_begin(void) pwq->max_active = 0; } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } } - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } /** @@ -3647,7 +3647,7 @@ bool freeze_workqueues_busy(void) unsigned int cpu; bool busy = false; - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(!workqueue_freezing); @@ -3671,7 +3671,7 @@ bool freeze_workqueues_busy(void) } } out_unlock: - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); return busy; } @@ -3688,7 +3688,7 @@ void thaw_workqueues(void) { unsigned int cpu; - spin_lock(&workqueue_lock); + spin_lock_irq(&workqueue_lock); if (!workqueue_freezing) goto out_unlock; @@ -3698,7 +3698,7 @@ void thaw_workqueues(void) struct workqueue_struct *wq; for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; @@ -3716,13 +3716,13 @@ void thaw_workqueues(void) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } } workqueue_freezing = false; out_unlock: - spin_unlock(&workqueue_lock); + spin_unlock_irq(&workqueue_lock); } #endif /* CONFIG_FREEZER */ -- cgit v1.2.3 From e904e6c2668bba78497c660aec812ca3f77f4ef9 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: workqueue: introduce kmem_cache for pool_workqueues pool_workqueues need to be aligned to 1 << WORK_STRUCT_FLAG_BITS as the lower bits of work->data are used for flags when they're pointing to pool_workqueues. Due to historical reasons, unbound pool_workqueues are allocated using kzalloc() with sufficient buffer area for alignment and aligned manually. The original pointer is stored at the end which free_pwqs() retrieves when freeing it. There's no reason for this hackery anymore. Set alignment of struct pool_workqueue to 1 << WORK_STRUCT_FLAG_BITS, add kmem_cache for pool_workqueues with proper alignment and replace the hacky alloc and free implementation with plain kmem_cache_zalloc/free(). In case WORK_STRUCT_FLAG_BITS gets shrunk too much and makes fields of pool_workqueues misaligned, trigger WARN if the alignment of struct pool_workqueue becomes smaller than that of long long. Note that assertion on IS_ALIGNED() is removed from alloc_pwqs(). We already have another one in pwq init loop in __alloc_workqueue_key(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 43 ++++++++++++------------------------------- 1 file changed, 12 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c585d0ebd353..f9e2ad9a3205 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -169,7 +169,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ -}; +} __aligned(1 << WORK_STRUCT_FLAG_BITS); /* * Structure used to wait for workqueue flush. @@ -233,6 +233,8 @@ struct workqueue_struct { char name[]; /* I: workqueue name */ }; +static struct kmem_cache *pwq_cache; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -3096,34 +3098,11 @@ int keventd_up(void) static int alloc_pwqs(struct workqueue_struct *wq) { - /* - * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. - * Make sure that the alignment isn't lower than that of - * unsigned long long. - */ - const size_t size = sizeof(struct pool_workqueue); - const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, - __alignof__(unsigned long long)); - if (!(wq->flags & WQ_UNBOUND)) - wq->pool_wq.pcpu = __alloc_percpu(size, align); - else { - void *ptr; - - /* - * Allocate enough room to align pwq and put an extra - * pointer at the end pointing back to the originally - * allocated pointer which will be used for free. - */ - ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); - if (ptr) { - wq->pool_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->pool_wq.single + 1) = ptr; - } - } + wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue); + else + wq->pool_wq.single = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); return wq->pool_wq.v ? 0 : -ENOMEM; } @@ -3131,10 +3110,8 @@ static void free_pwqs(struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->pool_wq.pcpu); - else if (wq->pool_wq.single) { - /* the pointer to free is stored right after the pwq */ - kfree(*(void **)(wq->pool_wq.single + 1)); - } + else + kmem_cache_free(pwq_cache, wq->pool_wq.single); } static int wq_clamp_max_active(int max_active, unsigned int flags, @@ -3734,6 +3711,10 @@ static int __init init_workqueues(void) BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < WORK_CPU_END * NR_STD_WORKER_POOLS); + WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); -- cgit v1.2.3 From 30cdf2496d8ac2ef94b9b85f1891cf069490c8c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:57 -0700 Subject: workqueue: add workqueue_struct->pwqs list Add workqueue_struct->pwqs list and chain all pool_workqueues belonging to a workqueue there. This will be used to implement generic pool_workqueue iteration and handle multiple pool_workqueues for the scheduled unbound pools with custom attributes. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f9e2ad9a3205..8634fc9d52d2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -169,6 +169,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ + struct list_head pwqs_node; /* I: node on wq->pwqs */ } __aligned(1 << WORK_STRUCT_FLAG_BITS); /* @@ -212,6 +213,7 @@ struct workqueue_struct { struct pool_workqueue *single; unsigned long v; } pool_wq; /* I: pwq's */ + struct list_head pwqs; /* I: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ @@ -3096,14 +3098,32 @@ int keventd_up(void) return system_wq != NULL; } -static int alloc_pwqs(struct workqueue_struct *wq) +static int alloc_and_link_pwqs(struct workqueue_struct *wq) { - if (!(wq->flags & WQ_UNBOUND)) + int cpu; + + if (!(wq->flags & WQ_UNBOUND)) { wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue); - else - wq->pool_wq.single = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!wq->pool_wq.pcpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = get_pwq(cpu, wq); - return wq->pool_wq.v ? 0 : -ENOMEM; + list_add_tail(&pwq->pwqs_node, &wq->pwqs); + } + } else { + struct pool_workqueue *pwq; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) + return -ENOMEM; + + wq->pool_wq.single = pwq; + list_add_tail(&pwq->pwqs_node, &wq->pwqs); + } + + return 0; } static void free_pwqs(struct workqueue_struct *wq) @@ -3165,13 +3185,14 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); - if (alloc_pwqs(wq) < 0) + if (alloc_and_link_pwqs(wq) < 0) goto err; for_each_pwq_cpu(cpu, wq) { -- cgit v1.2.3 From 49e3cf44df0663a521aa71e7667c52a9dbd0fce9 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:58 -0700 Subject: workqueue: replace for_each_pwq_cpu() with for_each_pwq() Introduce for_each_pwq() which iterates all pool_workqueues of a workqueue using the recently added workqueue->pwqs list and replace for_each_pwq_cpu() usages with it. This is primarily to remove the single unbound CPU assumption from pwq iteration for the scheduled unbound pools with custom attributes support which would introduce multiple unbound pwqs per workqueue; however, it also simplifies iterator users. Note that pwq->pool initialization is moved to alloc_and_link_pwqs() as that now is the only place which is explicitly handling the two pwq types. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 53 ++++++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8634fc9d52d2..2db1532b09dc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -273,12 +273,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, return WORK_CPU_END; } -static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, - struct workqueue_struct *wq) -{ - return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); -} - /* * CPU iterators * @@ -289,8 +283,6 @@ static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, * * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND - * for_each_pwq_cpu() : possible CPUs for bound workqueues, - * WORK_CPU_UNBOUND for unbound workqueues */ #define for_each_wq_cpu(cpu) \ for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ @@ -302,10 +294,13 @@ static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, (cpu) < WORK_CPU_END; \ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) -#define for_each_pwq_cpu(cpu, wq) \ - for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) +/** + * for_each_pwq - iterate through all pool_workqueues of the specified workqueue + * @pwq: iteration cursor + * @wq: the target workqueue + */ +#define for_each_pwq(pwq, wq) \ + list_for_each_entry((pwq), &(wq)->pwqs, pwqs_node) #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -2505,15 +2500,14 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; - unsigned int cpu; + struct pool_workqueue *pwq; if (flush_color >= 0) { WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; spin_lock_irq(&pool->lock); @@ -2712,7 +2706,7 @@ EXPORT_SYMBOL_GPL(flush_workqueue); void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; - unsigned int cpu; + struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much @@ -2726,8 +2720,7 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { bool drained; spin_lock_irq(&pwq->pool->lock); @@ -3100,6 +3093,7 @@ int keventd_up(void) static int alloc_and_link_pwqs(struct workqueue_struct *wq) { + bool highpri = wq->flags & WQ_HIGHPRI; int cpu; if (!(wq->flags & WQ_UNBOUND)) { @@ -3110,6 +3104,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = get_pwq(cpu, wq); + pwq->pool = get_std_worker_pool(cpu, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); } } else { @@ -3120,6 +3115,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return -ENOMEM; wq->pool_wq.single = pwq; + pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); } @@ -3154,7 +3150,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, { va_list args, args1; struct workqueue_struct *wq; - unsigned int cpu; + struct pool_workqueue *pwq; size_t namelen; /* determine namelen, allocate wq and format name */ @@ -3195,11 +3191,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err; - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - + for_each_pwq(pwq, wq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); pwq->wq = wq; pwq->flush_color = -1; pwq->max_active = max_active; @@ -3234,8 +3227,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, spin_lock_irq(&workqueue_lock); if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_pwq_cpu(cpu, wq) - get_pwq(cpu, wq)->max_active = 0; + for_each_pwq(pwq, wq) + pwq->max_active = 0; list_add(&wq->list, &workqueues); @@ -3261,14 +3254,13 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { - unsigned int cpu; + struct pool_workqueue *pwq; /* drain it before proceeding with destruction */ drain_workqueue(wq); /* sanity checks */ - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) @@ -3330,7 +3322,7 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { - unsigned int cpu; + struct pool_workqueue *pwq; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); @@ -3338,8 +3330,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->saved_max_active = max_active; - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; spin_lock(&pool->lock); -- cgit v1.2.3 From 171169695555831e8cc41dbc1783700868631ea5 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:58 -0700 Subject: workqueue: introduce for_each_pool() With the scheduled unbound pools with custom attributes, there will be multiple unbound pools, so it wouldn't be able to use for_each_wq_cpu() + for_each_std_worker_pool() to iterate through all pools. Introduce for_each_pool() which iterates through all pools using worker_pool_idr and use it instead of for_each_wq_cpu() + for_each_std_worker_pool() combination in freeze_workqueues_begin(). Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2db1532b09dc..55494e3f9f3b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -294,6 +294,14 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, (cpu) < WORK_CPU_END; \ (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) +/** + * for_each_pool - iterate through all worker_pools in the system + * @pool: iteration cursor + * @id: integer used for iteration + */ +#define for_each_pool(pool, id) \ + idr_for_each_entry(&worker_pool_idr, pool, id) + /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor @@ -3586,33 +3594,31 @@ EXPORT_SYMBOL_GPL(work_on_cpu); */ void freeze_workqueues_begin(void) { - unsigned int cpu; + struct worker_pool *pool; + int id; spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; + for_each_pool(pool, id) { struct workqueue_struct *wq; - for_each_std_worker_pool(pool, cpu) { - spin_lock(&pool->lock); - - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; + spin_lock(&pool->lock); - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + WARN_ON_ONCE(pool->flags & POOL_FREEZING); + pool->flags |= POOL_FREEZING; - if (pwq && pwq->pool == pool && - (wq->flags & WQ_FREEZABLE)) - pwq->max_active = 0; - } + list_for_each_entry(wq, &workqueues, list) { + struct pool_workqueue *pwq = get_pwq(pool->cpu, wq); - spin_unlock(&pool->lock); + if (pwq && pwq->pool == pool && + (wq->flags & WQ_FREEZABLE)) + pwq->max_active = 0; } + + spin_unlock(&pool->lock); } spin_unlock_irq(&workqueue_lock); -- cgit v1.2.3 From 24b8a84718ed28a51b452881612c267ba3f2b263 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:58 -0700 Subject: workqueue: restructure pool / pool_workqueue iterations in freeze/thaw functions The three freeze/thaw related functions - freeze_workqueues_begin(), freeze_workqueues_busy() and thaw_workqueues() - need to iterate through all pool_workqueues of all freezable workqueues. They did it by first iterating pools and then visiting all pwqs (pool_workqueues) of all workqueues and process it if its pwq->pool matches the current pool. This is rather backwards and done this way partly because workqueue didn't have fitting iteration helpers and partly to avoid the number of lock operations on pool->lock. Workqueue now has fitting iterators and the locking operation overhead isn't anything to worry about - those locks are unlikely to be contended and the same CPU visiting the same set of locks multiple times isn't expensive. Restructure the three functions such that the flow better matches the logical steps and pwq iteration is done using for_each_pwq() inside workqueue iteration. * freeze_workqueues_begin(): Setting of FREEZING is moved into a separate for_each_pool() iteration. pwq iteration for clearing max_active is updated as described above. * freeze_workqueues_busy(): pwq iteration updated as described above. * thaw_workqueues(): The single for_each_wq_cpu() iteration is broken into three discrete steps - clearing FREEZING, restoring max_active, and kicking workers. The first and last steps use for_each_pool() and the second step uses pwq iteration described above. This makes the code easier to understand and removes the use of for_each_wq_cpu() for walking pwqs, which can't support multiple unbound pwqs which will be needed to implement unbound workqueues with custom attributes. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 87 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 55494e3f9f3b..8942cc74d83b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3595,6 +3595,8 @@ EXPORT_SYMBOL_GPL(work_on_cpu); void freeze_workqueues_begin(void) { struct worker_pool *pool; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; int id; spin_lock_irq(&workqueue_lock); @@ -3602,23 +3604,24 @@ void freeze_workqueues_begin(void) WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; + /* set FREEZING */ for_each_pool(pool, id) { - struct workqueue_struct *wq; - spin_lock(&pool->lock); - WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; + spin_unlock(&pool->lock); + } - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(pool->cpu, wq); + /* suppress further executions by setting max_active to zero */ + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; - if (pwq && pwq->pool == pool && - (wq->flags & WQ_FREEZABLE)) - pwq->max_active = 0; + for_each_pwq(pwq, wq) { + spin_lock(&pwq->pool->lock); + pwq->max_active = 0; + spin_unlock(&pwq->pool->lock); } - - spin_unlock(&pool->lock); } spin_unlock_irq(&workqueue_lock); @@ -3639,25 +3642,22 @@ void freeze_workqueues_begin(void) */ bool freeze_workqueues_busy(void) { - unsigned int cpu; bool busy = false; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; spin_lock_irq(&workqueue_lock); WARN_ON_ONCE(!workqueue_freezing); - for_each_wq_cpu(cpu) { - struct workqueue_struct *wq; + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (!pwq || !(wq->flags & WQ_FREEZABLE)) - continue; - + for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; @@ -3681,40 +3681,43 @@ out_unlock: */ void thaw_workqueues(void) { - unsigned int cpu; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + struct worker_pool *pool; + int id; spin_lock_irq(&workqueue_lock); if (!workqueue_freezing) goto out_unlock; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; - struct workqueue_struct *wq; - - for_each_std_worker_pool(pool, cpu) { - spin_lock(&pool->lock); - - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; - - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (!pwq || pwq->pool != pool || - !(wq->flags & WQ_FREEZABLE)) - continue; - - /* restore max_active and repopulate worklist */ - pwq_set_max_active(pwq, wq->saved_max_active); - } + /* clear FREEZING */ + for_each_pool(pool, id) { + spin_lock(&pool->lock); + WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); + pool->flags &= ~POOL_FREEZING; + spin_unlock(&pool->lock); + } - wake_up_worker(pool); + /* restore max_active and repopulate worklist */ + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; - spin_unlock(&pool->lock); + for_each_pwq(pwq, wq) { + spin_lock(&pwq->pool->lock); + pwq_set_max_active(pwq, wq->saved_max_active); + spin_unlock(&pwq->pool->lock); } } + /* kick workers */ + for_each_pool(pool, id) { + spin_lock(&pool->lock); + wake_up_worker(pool); + spin_unlock(&pool->lock); + } + workqueue_freezing = false; out_unlock: spin_unlock_irq(&workqueue_lock); -- cgit v1.2.3 From 493a1724fef9a3e931d9199f1a19e358e526a6e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: workqueue: add wokrqueue_struct->maydays list to replace mayday cpu iterators Similar to how pool_workqueue iteration used to be, raising and servicing mayday requests is based on CPU numbers. It's hairy because cpumask_t may not be able to handle WORK_CPU_UNBOUND and cpumasks are assumed to be always set on UP. This is ugly and can't handle multiple unbound pools to be added for unbound workqueues w/ custom attributes. Add workqueue_struct->maydays. When a pool_workqueue needs rescuing, it gets chained on the list through pool_workqueue->mayday_node and rescuer_thread() consumes the list until it's empty. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 77 ++++++++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8942cc74d83b..26c67c76b6c5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -170,6 +170,7 @@ struct pool_workqueue { int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ struct list_head pwqs_node; /* I: node on wq->pwqs */ + struct list_head mayday_node; /* W: node on wq->maydays */ } __aligned(1 << WORK_STRUCT_FLAG_BITS); /* @@ -181,27 +182,6 @@ struct wq_flusher { struct completion done; /* flush completion */ }; -/* - * All cpumasks are assumed to be always set on UP and thus can't be - * used to determine whether there's something to be done. - */ -#ifdef CONFIG_SMP -typedef cpumask_var_t mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) \ - cpumask_test_and_set_cpu((cpu), (mask)) -#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) -#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) -#define free_mayday_mask(mask) free_cpumask_var((mask)) -#else -typedef unsigned long mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) -#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) -#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) -#define alloc_mayday_mask(maskp, gfp) true -#define free_mayday_mask(mask) do { } while (0) -#endif - /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: @@ -224,7 +204,7 @@ struct workqueue_struct { struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ - mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct list_head maydays; /* W: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* W: drain in progress */ @@ -1850,23 +1830,21 @@ static void idle_worker_timeout(unsigned long __pool) spin_unlock_irq(&pool->lock); } -static bool send_mayday(struct work_struct *work) +static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; - unsigned int cpu; + + lockdep_assert_held(&workqueue_lock); if (!(wq->flags & WQ_RESCUER)) - return false; + return; /* mayday mayday mayday */ - cpu = pwq->pool->cpu; - /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ - if (cpu == WORK_CPU_UNBOUND) - cpu = 0; - if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + if (list_empty(&pwq->mayday_node)) { + list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); - return true; + } } static void pool_mayday_timeout(unsigned long __pool) @@ -1874,7 +1852,8 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&pool->lock); + spin_lock_irq(&workqueue_lock); /* for wq->maydays */ + spin_lock(&pool->lock); if (need_to_create_worker(pool)) { /* @@ -1887,7 +1866,8 @@ static void pool_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); + spin_unlock_irq(&workqueue_lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2336,8 +2316,6 @@ static int rescuer_thread(void *__rescuer) struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; - bool is_unbound = wq->flags & WQ_UNBOUND; - unsigned int cpu; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2355,18 +2333,19 @@ repeat: return 0; } - /* - * See whether any cpu is asking for help. Unbounded - * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. - */ - for_each_mayday_cpu(cpu, wq->mayday_mask) { - unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; - struct pool_workqueue *pwq = get_pwq(tcpu, wq); + /* see whether any pwq is asking for help */ + spin_lock_irq(&workqueue_lock); + + while (!list_empty(&wq->maydays)) { + struct pool_workqueue *pwq = list_first_entry(&wq->maydays, + struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); - mayday_clear_cpu(cpu, wq->mayday_mask); + list_del_init(&pwq->mayday_node); + + spin_unlock_irq(&workqueue_lock); /* migrate to the target cpu if possible */ worker_maybe_bind_and_lock(pool); @@ -2392,9 +2371,12 @@ repeat: wake_up_worker(pool); rescuer->pool = NULL; - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); + spin_lock(&workqueue_lock); } + spin_unlock_irq(&workqueue_lock); + /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); @@ -3192,6 +3174,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); + INIT_LIST_HEAD(&wq->maydays); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3205,14 +3188,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, pwq->flush_color = -1; pwq->max_active = max_active; INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->mayday_node); } if (flags & WQ_RESCUER) { struct worker *rescuer; - if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) - goto err; - wq->rescuer = rescuer = alloc_worker(); if (!rescuer) goto err; @@ -3246,7 +3227,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, err: if (wq) { free_pwqs(wq); - free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); kfree(wq); } @@ -3289,7 +3269,6 @@ void destroy_workqueue(struct workqueue_struct *wq) if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); - free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); } -- cgit v1.2.3 From d84ff0512f1bfc0d8c864efadb4523fce68919cc Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: workqueue: consistently use int for @cpu variables Workqueue is mixing unsigned int and int for @cpu variables. There's no point in using unsigned int for cpus - many of cpu related APIs take int anyway. Consistently use int for @cpu variables so that we can use negative values to mark special ones. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- include/linux/workqueue.h | 6 +++--- kernel/workqueue.c | 24 +++++++++++------------- kernel/workqueue_internal.h | 5 ++--- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5bd030f630a9..899be6636d20 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -435,7 +435,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); -extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq); +extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); /* @@ -466,12 +466,12 @@ static inline bool __deprecated flush_delayed_work_sync(struct delayed_work *dwo } #ifndef CONFIG_SMP -static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); +long work_on_cpu(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 26c67c76b6c5..73c5f68065b5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,7 +124,7 @@ enum { struct worker_pool { spinlock_t lock; /* the pool lock */ - unsigned int cpu; /* I: the associated cpu */ + int cpu; /* I: the associated cpu */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -467,8 +467,7 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static struct pool_workqueue *get_pwq(unsigned int cpu, - struct workqueue_struct *wq) +static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) @@ -730,7 +729,7 @@ static void wake_up_worker(struct worker_pool *pool) * CONTEXT: * spin_lock_irq(rq->lock) */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_waking_up(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task); @@ -755,8 +754,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) * RETURNS: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -1159,7 +1157,7 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, +static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; @@ -1714,7 +1712,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(pool->cpu), - "kworker/%u:%d%s", pool->cpu, id, pri); + "kworker/%d:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d%s", id, pri); @@ -3345,7 +3343,7 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * RETURNS: * %true if congested, %false otherwise. */ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq = get_pwq(cpu, wq); @@ -3461,7 +3459,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct worker_pool *pool; switch (action & ~CPU_TASKS_FROZEN) { @@ -3507,7 +3505,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct work_struct unbind_work; switch (action & ~CPU_TASKS_FROZEN) { @@ -3547,7 +3545,7 @@ static void work_for_cpu_fn(struct work_struct *work) * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; @@ -3705,7 +3703,7 @@ out_unlock: static int __init init_workqueues(void) { - unsigned int cpu; + int cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index f9c887731e2b..f116f071d919 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -59,8 +59,7 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_waking_up(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit v1.2.3 From 420c0ddb1f205a3511b766d0dfee2cc87ed9dae0 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: workqueue: remove workqueue_struct->pool_wq.single workqueue->pool_wq union is used to point either to percpu pwqs (pool_workqueues) or single unbound pwq. As the first pwq can be accessed via workqueue->pwqs list, there's no reason for the single pointer anymore. Use list_first_entry(workqueue->pwqs) to access the unbound pwq and drop workqueue->pool_wq.single pointer and the pool_wq union. It simplifies the code and eases implementing multiple unbound pools w/ custom attributes. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 73c5f68065b5..acee7b525d51 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -188,11 +188,7 @@ struct wq_flusher { */ struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ - union { - struct pool_workqueue __percpu *pcpu; - struct pool_workqueue *single; - unsigned long v; - } pool_wq; /* I: pwq's */ + struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* I: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ @@ -471,9 +467,11 @@ static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->pool_wq.pcpu, cpu); - } else if (likely(cpu == WORK_CPU_UNBOUND)) - return wq->pool_wq.single; + return per_cpu_ptr(wq->cpu_pwqs, cpu); + } else if (likely(cpu == WORK_CPU_UNBOUND)) { + return list_first_entry(&wq->pwqs, struct pool_workqueue, + pwqs_node); + } return NULL; } @@ -3085,8 +3083,8 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) int cpu; if (!(wq->flags & WQ_UNBOUND)) { - wq->pool_wq.pcpu = alloc_percpu(struct pool_workqueue); - if (!wq->pool_wq.pcpu) + wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); + if (!wq->cpu_pwqs) return -ENOMEM; for_each_possible_cpu(cpu) { @@ -3102,7 +3100,6 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) if (!pwq) return -ENOMEM; - wq->pool_wq.single = pwq; pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); } @@ -3113,9 +3110,10 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) static void free_pwqs(struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->pool_wq.pcpu); - else - kmem_cache_free(pwq_cache, wq->pool_wq.single); + free_percpu(wq->cpu_pwqs); + else if (!list_empty(&wq->pwqs)) + kmem_cache_free(pwq_cache, list_first_entry(&wq->pwqs, + struct pool_workqueue, pwqs_node)); } static int wq_clamp_max_active(int max_active, unsigned int flags, -- cgit v1.2.3 From 7fb98ea79cecb14fc1735544146be06fdb1944c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: replace get_pwq() with explicit per_cpu_ptr() accesses and first_pwq() get_pwq() takes @cpu, which can also be WORK_CPU_UNBOUND, and @wq and returns the matching pwq (pool_workqueue). We want to move away from using @cpu for identifying pools and pwqs for unbound pools with custom attributes and there is only one user - workqueue_congested() - which makes use of the WQ_UNBOUND conditional in get_pwq(). All other users already know whether they're dealing with a per-cpu or unbound workqueue. Replace get_pwq() with explicit per_cpu_ptr(wq->cpu_pwqs, cpu) for per-cpu workqueues and first_pwq() for unbound ones, and open-code WQ_UNBOUND conditional in workqueue_congested(). Note that this makes workqueue_congested() behave sligntly differently when @cpu other than WORK_CPU_UNBOUND is specified. It ignores @cpu for unbound workqueues and always uses the first pwq instead of oopsing. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index acee7b525d51..577ac719eaec 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -463,16 +463,9 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) +static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->cpu_pwqs, cpu); - } else if (likely(cpu == WORK_CPU_UNBOUND)) { - return list_first_entry(&wq->pwqs, struct pool_workqueue, - pwqs_node); - } - return NULL; + return list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); } static unsigned int work_color_to_flags(int color) @@ -1191,7 +1184,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, * work needs to be queued on that cpu to guarantee * non-reentrancy. */ - pwq = get_pwq(cpu, wq); + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); last_pool = get_work_pool(work); if (last_pool && last_pool != pwq->pool) { @@ -1202,7 +1195,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, worker = find_worker_executing_work(last_pool, work); if (worker && worker->current_pwq->wq == wq) { - pwq = get_pwq(last_pool->cpu, wq); + pwq = per_cpu_ptr(wq->cpu_pwqs, last_pool->cpu); } else { /* meh... not running there, queue here */ spin_unlock(&last_pool->lock); @@ -1212,7 +1205,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, spin_lock(&pwq->pool->lock); } } else { - pwq = get_pwq(WORK_CPU_UNBOUND, wq); + pwq = first_pwq(wq); spin_lock(&pwq->pool->lock); } @@ -1650,7 +1643,7 @@ static void rebind_workers(struct worker_pool *pool) else wq = system_wq; - insert_work(get_pwq(pool->cpu, wq), rebind_work, + insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, worker->scheduled.next, work_color_to_flags(WORK_NO_COLOR)); } @@ -3088,7 +3081,8 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return -ENOMEM; for_each_possible_cpu(cpu) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct pool_workqueue *pwq = + per_cpu_ptr(wq->cpu_pwqs, cpu); pwq->pool = get_std_worker_pool(cpu, highpri); list_add_tail(&pwq->pwqs_node, &wq->pwqs); @@ -3343,7 +3337,12 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct pool_workqueue *pwq; + + if (!(wq->flags & WQ_UNBOUND)) + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + else + pwq = first_pwq(wq); return !list_empty(&pwq->delayed_works); } -- cgit v1.2.3 From 76af4d936153afec176c53378e6ba8671e7e237d Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: update synchronization rules on workqueue->pwqs Make workqueue->pwqs protected by workqueue_lock for writes and sched-RCU protected for reads. Lockdep assertions are added to for_each_pwq() and first_pwq() and all their users are converted to either hold workqueue_lock or disable preemption/irq. alloc_and_link_pwqs() is updated to use list_add_tail_rcu() for consistency which isn't strictly necessary as the workqueue isn't visible. destroy_workqueue() isn't updated to sched-RCU release pwqs. This is okay as the workqueue should have on users left by that point. The locking is superflous at this point. This is to help implementation of unbound pools/pwqs with custom attributes. This patch doesn't introduce any behavior changes. v2: Updated for_each_pwq() use if/else for the hidden assertion statement instead of just if as suggested by Lai. This avoids confusing the following else clause. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 87 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 70 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 577ac719eaec..e060ff2bc20c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -42,6 +42,7 @@ #include <linux/lockdep.h> #include <linux/idr.h> #include <linux/hashtable.h> +#include <linux/rculist.h> #include "workqueue_internal.h" @@ -118,6 +119,8 @@ enum { * F: wq->flush_mutex protected. * * W: workqueue_lock protected. + * + * R: workqueue_lock protected for writes. Sched-RCU protected for reads. */ /* struct worker is defined in workqueue_internal.h */ @@ -169,7 +172,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ - struct list_head pwqs_node; /* I: node on wq->pwqs */ + struct list_head pwqs_node; /* R: node on wq->pwqs */ struct list_head mayday_node; /* W: node on wq->maydays */ } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -189,7 +192,7 @@ struct wq_flusher { struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ - struct list_head pwqs; /* I: all pwqs of this wq */ + struct list_head pwqs; /* R: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ @@ -227,6 +230,11 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#define assert_rcu_or_wq_lock() \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&workqueue_lock), \ + "sched RCU or workqueue lock should be held") + #define for_each_std_worker_pool(pool, cpu) \ for ((pool) = &std_worker_pools(cpu)[0]; \ (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) @@ -282,9 +290,18 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor * @wq: the target workqueue + * + * This must be called either with workqueue_lock held or sched RCU read + * locked. If the pwq needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pwq stays online. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. */ #define for_each_pwq(pwq, wq) \ - list_for_each_entry((pwq), &(wq)->pwqs, pwqs_node) + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ + if (({ assert_rcu_or_wq_lock(); false; })) { } \ + else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -463,9 +480,19 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } +/** + * first_pwq - return the first pool_workqueue of the specified workqueue + * @wq: the target workqueue + * + * This must be called either with workqueue_lock held or sched RCU read + * locked. If the pwq needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pwq stays online. + */ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - return list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); + assert_rcu_or_wq_lock(); + return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, + pwqs_node); } static unsigned int work_color_to_flags(int color) @@ -2486,10 +2513,12 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, atomic_set(&wq->nr_pwqs_to_flush, 1); } + local_irq_disable(); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2506,9 +2535,11 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); } + local_irq_enable(); + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); @@ -2699,12 +2730,14 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); + local_irq_disable(); + for_each_pwq(pwq, wq) { bool drained; - spin_lock_irq(&pwq->pool->lock); + spin_lock(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock_irq(&pwq->pool->lock); + spin_unlock(&pwq->pool->lock); if (drained) continue; @@ -2713,13 +2746,17 @@ reflush: (flush_cnt % 100 == 0 && flush_cnt <= 1000)) pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", wq->name, flush_cnt); + + local_irq_enable(); goto reflush; } - spin_lock_irq(&workqueue_lock); + spin_lock(&workqueue_lock); if (!--wq->nr_drainers) wq->flags &= ~WQ_DRAINING; - spin_unlock_irq(&workqueue_lock); + spin_unlock(&workqueue_lock); + + local_irq_enable(); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3085,7 +3122,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) per_cpu_ptr(wq->cpu_pwqs, cpu); pwq->pool = get_std_worker_pool(cpu, highpri); - list_add_tail(&pwq->pwqs_node, &wq->pwqs); + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } } else { struct pool_workqueue *pwq; @@ -3095,7 +3132,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return -ENOMEM; pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); - list_add_tail(&pwq->pwqs_node, &wq->pwqs); + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } return 0; @@ -3172,6 +3209,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err; + local_irq_disable(); for_each_pwq(pwq, wq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); pwq->wq = wq; @@ -3180,6 +3218,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); } + local_irq_enable(); if (flags & WQ_RESCUER) { struct worker *rescuer; @@ -3237,24 +3276,32 @@ void destroy_workqueue(struct workqueue_struct *wq) /* drain it before proceeding with destruction */ drain_workqueue(wq); + spin_lock_irq(&workqueue_lock); + /* sanity checks */ for_each_pwq(pwq, wq) { int i; - for (i = 0; i < WORK_NR_COLORS; i++) - if (WARN_ON(pwq->nr_in_flight[i])) + for (i = 0; i < WORK_NR_COLORS; i++) { + if (WARN_ON(pwq->nr_in_flight[i])) { + spin_unlock_irq(&workqueue_lock); return; + } + } + if (WARN_ON(pwq->nr_active) || - WARN_ON(!list_empty(&pwq->delayed_works))) + WARN_ON(!list_empty(&pwq->delayed_works))) { + spin_unlock_irq(&workqueue_lock); return; + } } /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - spin_lock_irq(&workqueue_lock); list_del(&wq->list); + spin_unlock_irq(&workqueue_lock); if (wq->flags & WQ_RESCUER) { @@ -3338,13 +3385,19 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq; + bool ret; + + preempt_disable(); if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else pwq = first_pwq(wq); - return !list_empty(&pwq->delayed_works); + ret = !list_empty(&pwq->delayed_works); + preempt_enable(); + + return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); -- cgit v1.2.3 From fa1b54e69bc6c04674c9bb96a6cfa8b2c9f44771 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: update synchronization rules on worker_pool_idr Make worker_pool_idr protected by workqueue_lock for writes and sched-RCU protected for reads. Lockdep assertions are added to for_each_pool() and get_work_pool() and all their users are converted to either hold workqueue_lock or disable preemption/irq. worker_pool_assign_id() is updated to hold workqueue_lock when allocating a pool ID. As idr_get_new() always performs RCU-safe assignment, this is enough on the writer side. As standard pools are never destroyed, there's nothing to do on that side. The locking is superflous at this point. This is to help implementation of unbound pools/pwqs with custom attributes. This patch doesn't introduce any behavior changes. v2: Updated for_each_pwq() use if/else for the hidden assertion statement instead of just if as suggested by Lai. This avoids confusing the following else clause. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 71 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e060ff2bc20c..46381490f496 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -282,9 +282,18 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor * @id: integer used for iteration + * + * This must be called either with workqueue_lock held or sched RCU read + * locked. If the pool needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pool stays online. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. */ #define for_each_pool(pool, id) \ - idr_for_each_entry(&worker_pool_idr, pool, id) + idr_for_each_entry(&worker_pool_idr, pool, id) \ + if (({ assert_rcu_or_wq_lock(); false; })) { } \ + else /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue @@ -432,8 +441,10 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_std_worker_pools); static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; -/* idr of all pools */ -static DEFINE_MUTEX(worker_pool_idr_mutex); +/* + * idr of all pools. Modifications are protected by workqueue_lock. Read + * accesses are protected by sched-RCU protected. + */ static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); @@ -456,21 +467,16 @@ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; - mutex_lock(&worker_pool_idr_mutex); - idr_pre_get(&worker_pool_idr, GFP_KERNEL); - ret = idr_get_new(&worker_pool_idr, pool, &pool->id); - mutex_unlock(&worker_pool_idr_mutex); + do { + if (!idr_pre_get(&worker_pool_idr, GFP_KERNEL)) + return -ENOMEM; - return ret; -} + spin_lock_irq(&workqueue_lock); + ret = idr_get_new(&worker_pool_idr, pool, &pool->id); + spin_unlock_irq(&workqueue_lock); + } while (ret == -EAGAIN); -/* - * Lookup worker_pool by id. The idr currently is built during boot and - * never modified. Don't worry about locking for now. - */ -static struct worker_pool *worker_pool_by_id(int pool_id) -{ - return idr_find(&worker_pool_idr, pool_id); + return ret; } static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) @@ -586,13 +592,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Return the worker_pool @work was last associated with. %NULL if none. + * + * Pools are created and destroyed under workqueue_lock, and allows read + * access under sched-RCU read lock. As such, this function should be + * called under workqueue_lock or with preemption disabled. + * + * All fields of the returned pool are accessible as long as the above + * mentioned locking is in effect. If the returned pool needs to be used + * beyond the critical section, the caller is responsible for ensuring the + * returned pool is and stays online. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - struct worker_pool *pool; int pool_id; + assert_rcu_or_wq_lock(); + if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool; @@ -601,9 +617,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; - pool = worker_pool_by_id(pool_id); - WARN_ON_ONCE(!pool); - return pool; + return idr_find(&worker_pool_idr, pool_id); } /** @@ -2767,11 +2781,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) struct pool_workqueue *pwq; might_sleep(); + + local_irq_disable(); pool = get_work_pool(work); - if (!pool) + if (!pool) { + local_irq_enable(); return false; + } - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -3414,19 +3432,22 @@ EXPORT_SYMBOL_GPL(workqueue_congested); */ unsigned int work_busy(struct work_struct *work) { - struct worker_pool *pool = get_work_pool(work); + struct worker_pool *pool; unsigned long flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; + local_irq_save(flags); + pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + spin_lock(&pool->lock); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock(&pool->lock); } + local_irq_restore(flags); return ret; } -- cgit v1.2.3 From 34a06bd6b6fa92ccd9d3e6866b6cb91264c3cd20 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: replace POOL_MANAGING_WORKERS flag with worker_pool->manager_arb POOL_MANAGING_WORKERS is used to synchronize the manager role. Synchronizing among workers doesn't need blocking and that's why it's implemented as a flag. It got converted to a mutex a while back to add blocking wait from CPU hotplug path - 6037315269 ("workqueue: use mutex for global_cwq manager exclusion"). Later it turned out that synchronization among workers and cpu hotplug need to be done separately. Eventually, POOL_MANAGING_WORKERS is restored and workqueue->manager_mutex got morphed into workqueue->assoc_mutex - 552a37e936 ("workqueue: restore POOL_MANAGING_WORKERS") and b2eb83d123 ("workqueue: rename manager_mutex to assoc_mutex"). Now, we're gonna need to be able to lock out managers from destroy_workqueue() to support multiple unbound pools with custom attributes making it again necessary to be able to block on the manager role. This patch replaces POOL_MANAGING_WORKERS with worker_pool->manager_arb. This patch doesn't introduce any behavior changes. v2: s/manager_mutex/manager_arb/ Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 46381490f496..16f7f8d79d35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -64,7 +64,6 @@ enum { * create_worker() is in progress. */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -145,6 +144,7 @@ struct worker_pool { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ + struct mutex manager_arb; /* manager arbitration */ struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ @@ -706,7 +706,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->flags & POOL_MANAGING_WORKERS; + bool managing = mutex_is_locked(&pool->manager_arb); int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -2029,19 +2029,17 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (pool->flags & POOL_MANAGING_WORKERS) + if (!mutex_trylock(&pool->manager_arb)) return ret; - pool->flags |= POOL_MANAGING_WORKERS; - /* * To simplify both worker management and CPU hotplug, hold off * management while hotplug is in progress. CPU hotplug path can't - * grab %POOL_MANAGING_WORKERS to achieve this because that can - * lead to idle worker depletion (all become busy thinking someone - * else is managing) which in turn can result in deadlock under - * extreme circumstances. Use @pool->assoc_mutex to synchronize - * manager against CPU hotplug. + * grab @pool->manager_arb to achieve this because that can lead to + * idle worker depletion (all become busy thinking someone else is + * managing) which in turn can result in deadlock under extreme + * circumstances. Use @pool->assoc_mutex to synchronize manager + * against CPU hotplug. * * assoc_mutex would always be free unless CPU hotplug is in * progress. trylock first without dropping @pool->lock. @@ -2077,8 +2075,8 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - pool->flags &= ~POOL_MANAGING_WORKERS; mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_arb); return ret; } @@ -3806,6 +3804,7 @@ static int __init init_workqueues(void) setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); + mutex_init(&pool->manager_arb); mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); -- cgit v1.2.3 From 4e1a1f9a051b4c9a2821a2a0f7f4a27c701fba51 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: separate out init_worker_pool() from init_workqueues() This will be used to implement unbound pools with custom attributes. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16f7f8d79d35..094f16668e1b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3123,6 +3123,26 @@ int keventd_up(void) return system_wq != NULL; } +static void init_worker_pool(struct worker_pool *pool) +{ + spin_lock_init(&pool->lock); + pool->flags |= POOL_DISASSOCIATED; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); + + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; + + setup_timer(&pool->mayday_timer, pool_mayday_timeout, + (unsigned long)pool); + + mutex_init(&pool->manager_arb); + mutex_init(&pool->assoc_mutex); + ida_init(&pool->worker_ida); +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3790,23 +3810,8 @@ static int __init init_workqueues(void) struct worker_pool *pool; for_each_std_worker_pool(pool, cpu) { - spin_lock_init(&pool->lock); + init_worker_pool(pool); pool->cpu = cpu; - pool->flags |= POOL_DISASSOCIATED; - INIT_LIST_HEAD(&pool->worklist); - INIT_LIST_HEAD(&pool->idle_list); - hash_init(pool->busy_hash); - - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; - - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); - - mutex_init(&pool->manager_arb); - mutex_init(&pool->assoc_mutex); - ida_init(&pool->worker_ida); /* alloc pool ID */ BUG_ON(worker_pool_assign_id(pool)); -- cgit v1.2.3 From 7a4e344c5675eefbde93ed9a98ef45e0e4957bc2 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: introduce workqueue_attrs Introduce struct workqueue_attrs which carries worker attributes - currently the nice level and allowed cpumask along with helper routines alloc_workqueue_attrs() and free_workqueue_attrs(). Each worker_pool now carries ->attrs describing the attributes of its workers. All functions dealing with cpumask and nice level of workers are updated to follow worker_pool->attrs instead of determining them from other characteristics of the worker_pool, and init_workqueues() is updated to set worker_pool->attrs appropriately for all standard pools. Note that create_worker() is updated to always perform set_user_nice() and use set_cpus_allowed_ptr() combined with manual assertion of PF_THREAD_BOUND instead of kthread_bind(). This simplifies handling random attributes without affecting the outcome. This patch doesn't introduce any behavior changes. v2: Missing cpumask_var_t definition caused build failure on some archs. linux/cpumask.h included. Signed-off-by: Tejun Heo <tj@kernel.org> Reported-by: kbuild test robot <fengguang.wu@intel.com> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- include/linux/workqueue.h | 13 ++++++ kernel/workqueue.c | 103 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 94 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 899be6636d20..00c1b9ba8252 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -11,6 +11,7 @@ #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> +#include <linux/cpumask.h> struct workqueue_struct; @@ -115,6 +116,15 @@ struct delayed_work { int cpu; }; +/* + * A struct for workqueue attributes. This can be used to change + * attributes of an unbound workqueue. + */ +struct workqueue_attrs { + int nice; /* nice level */ + cpumask_var_t cpumask; /* allowed CPUs */ +}; + static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); @@ -399,6 +409,9 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, extern void destroy_workqueue(struct workqueue_struct *wq); +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); +void free_workqueue_attrs(struct workqueue_attrs *attrs); + extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 094f16668e1b..b0d3cbb83f63 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ + struct workqueue_attrs *attrs; /* I: worker attributes */ + /* * The current concurrency level. As it's likely to be accessed * from other CPUs during try_to_wake_up(), put it in a separate @@ -1566,14 +1568,13 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, pool->attrs->cpumask); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; if (task_cpu(current) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(pool->cpu))) + cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) return true; spin_unlock_irq(&pool->lock); @@ -1679,7 +1680,7 @@ static void rebind_workers(struct worker_pool *pool) * wq doesn't really matter but let's keep @worker->pool * and @pwq->pool consistent for sanity. */ - if (std_worker_pool_pri(worker->pool)) + if (worker->pool->attrs->nice < 0) wq = system_highpri_wq; else wq = system_wq; @@ -1721,7 +1722,7 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = std_worker_pool_pri(pool) ? "H" : ""; + const char *pri = pool->attrs->nice < 0 ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1751,24 +1752,23 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; - if (std_worker_pool_pri(pool)) - set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + set_user_nice(worker->task, pool->attrs->nice); + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * Determine CPU binding of the new worker depending on - * %POOL_DISASSOCIATED. The caller is responsible for ensuring the - * flag remains stable across this function. See the comments - * above the flag definition for details. - * - * As an unbound worker may later become a regular one if CPU comes - * online, make sure every worker has %PF_THREAD_BOUND set. + * %PF_THREAD_BOUND is used to prevent userland from meddling with + * cpumask of workqueue workers. This is an abuse. We need + * %PF_NO_SETAFFINITY. */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - kthread_bind(worker->task, pool->cpu); - } else { - worker->task->flags |= PF_THREAD_BOUND; + worker->task->flags |= PF_THREAD_BOUND; + + /* + * The caller is responsible for ensuring %POOL_DISASSOCIATED + * remains stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - } return worker; fail: @@ -3123,7 +3123,52 @@ int keventd_up(void) return system_wq != NULL; } -static void init_worker_pool(struct worker_pool *pool) +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free + * + * Undo alloc_workqueue_attrs(). + */ +void free_workqueue_attrs(struct workqueue_attrs *attrs) +{ + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } +} + +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. Returns NULL on failure. + */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +{ + struct workqueue_attrs *attrs; + + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_setall(attrs->cpumask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; +} + +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Returns 0 on success, -errno on failure. + */ +static int init_worker_pool(struct worker_pool *pool) { spin_lock_init(&pool->lock); pool->flags |= POOL_DISASSOCIATED; @@ -3141,6 +3186,11 @@ static void init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) @@ -3792,7 +3842,8 @@ out_unlock: static int __init init_workqueues(void) { - int cpu; + int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int i, cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < @@ -3809,10 +3860,18 @@ static int __init init_workqueues(void) for_each_wq_cpu(cpu) { struct worker_pool *pool; + i = 0; for_each_std_worker_pool(pool, cpu) { - init_worker_pool(pool); + BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; + if (cpu != WORK_CPU_UNBOUND) + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + else + cpumask_setall(pool->attrs->cpumask); + + pool->attrs->nice = std_nice[i++]; + /* alloc pool ID */ BUG_ON(worker_pool_assign_id(pool)); } -- cgit v1.2.3 From 29c91e9912bed7060df6116af90286500f5a700d Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: workqueue: implement attribute-based unbound worker_pool management This patch makes unbound worker_pools reference counted and dynamically created and destroyed as workqueues needing them come and go. All unbound worker_pools are hashed on unbound_pool_hash which is keyed by the content of worker_pool->attrs. When an unbound workqueue is allocated, get_unbound_pool() is called with the attributes of the workqueue. If there already is a matching worker_pool, the reference count is bumped and the pool is returned. If not, a new worker_pool with matching attributes is created and returned. When an unbound workqueue is destroyed, put_unbound_pool() is called which decrements the reference count of the associated worker_pool. If the refcnt reaches zero, the worker_pool is destroyed in sched-RCU safe way. Note that the standard unbound worker_pools - normal and highpri ones with no specific cpumask affinity - are no longer created explicitly during init_workqueues(). init_workqueues() only initializes workqueue_attrs to be used for standard unbound pools - unbound_std_wq_attrs[]. The pools are spawned on demand as workqueues are created. v2: - Comment added to init_worker_pool() explaining that @pool should be in a condition which can be passed to put_unbound_pool() even on failure. - pool->refcnt reaching zero and the pool being removed from unbound_pool_hash should be dynamic. pool->refcnt is converted to int from atomic_t and now manipulated inside workqueue_lock. - Removed an incorrect sanity check on nr_idle in put_unbound_pool() which may trigger spuriously. All changes were suggested by Lai Jiangshan. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 224 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b0d3cbb83f63..3fe2c79bf166 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,6 +41,7 @@ #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> +#include <linux/jhash.h> #include <linux/hashtable.h> #include <linux/rculist.h> @@ -80,6 +81,7 @@ enum { NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ + UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ @@ -149,6 +151,8 @@ struct worker_pool { struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ + struct hlist_node hash_node; /* R: unbound_pool_hash node */ + int refcnt; /* refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -156,6 +160,12 @@ struct worker_pool { * cacheline. */ atomic_t nr_running ____cacheline_aligned_in_smp; + + /* + * Destruction of pool is sched-RCU protected to allow dereferences + * from get_work_pool(). + */ + struct rcu_head rcu; } ____cacheline_aligned_in_smp; /* @@ -218,6 +228,11 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; +/* hash of all unbound pools keyed by pool->attrs */ +static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); + +static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -1742,7 +1757,7 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool = pool; worker->id = id; - if (pool->cpu != WORK_CPU_UNBOUND) + if (pool->cpu >= 0) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(pool->cpu), "kworker/%d:%d%s", pool->cpu, id, pri); @@ -3161,16 +3176,68 @@ fail: return NULL; } +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from) +{ + to->nice = from->nice; + cpumask_copy(to->cpumask, from->cpumask); +} + +/* + * Hacky implementation of jhash of bitmaps which only considers the + * specified number of bits. We probably want a proper implementation in + * include/linux/jhash.h. + */ +static u32 jhash_bitmap(const unsigned long *bitmap, int bits, u32 hash) +{ + int nr_longs = bits / BITS_PER_LONG; + int nr_leftover = bits % BITS_PER_LONG; + unsigned long leftover = 0; + + if (nr_longs) + hash = jhash(bitmap, nr_longs * sizeof(long), hash); + if (nr_leftover) { + bitmap_copy(&leftover, bitmap + nr_longs, nr_leftover); + hash = jhash(&leftover, sizeof(long), hash); + } + return hash; +} + +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) +{ + u32 hash = 0; + + hash = jhash_1word(attrs->nice, hash); + hash = jhash_bitmap(cpumask_bits(attrs->cpumask), nr_cpu_ids, hash); + return hash; +} + +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, + const struct workqueue_attrs *b) +{ + if (a->nice != b->nice) + return false; + if (!cpumask_equal(a->cpumask, b->cpumask)) + return false; + return true; +} + /** * init_worker_pool - initialize a newly zalloc'd worker_pool * @pool: worker_pool to initialize * * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. - * Returns 0 on success, -errno on failure. + * Returns 0 on success, -errno on failure. Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. */ static int init_worker_pool(struct worker_pool *pool) { spin_lock_init(&pool->lock); + pool->id = -1; + pool->cpu = -1; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); @@ -3187,12 +3254,136 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + INIT_HLIST_NODE(&pool->hash_node); + pool->refcnt = 1; + + /* shouldn't fail above this point */ pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); if (!pool->attrs) return -ENOMEM; return 0; } +static void rcu_free_pool(struct rcu_head *rcu) +{ + struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); + + ida_destroy(&pool->worker_ida); + free_workqueue_attrs(pool->attrs); + kfree(pool); +} + +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner. + */ +static void put_unbound_pool(struct worker_pool *pool) +{ + struct worker *worker; + + spin_lock_irq(&workqueue_lock); + if (--pool->refcnt) { + spin_unlock_irq(&workqueue_lock); + return; + } + + /* sanity checks */ + if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || + WARN_ON(!list_empty(&pool->worklist))) { + spin_unlock_irq(&workqueue_lock); + return; + } + + /* release id and unhash */ + if (pool->id >= 0) + idr_remove(&worker_pool_idr, pool->id); + hash_del(&pool->hash_node); + + spin_unlock_irq(&workqueue_lock); + + /* lock out manager and destroy all workers */ + mutex_lock(&pool->manager_arb); + spin_lock_irq(&pool->lock); + + while ((worker = first_worker(pool))) + destroy_worker(worker); + WARN_ON(pool->nr_workers || pool->nr_idle); + + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->manager_arb); + + /* shut down the timers */ + del_timer_sync(&pool->idle_timer); + del_timer_sync(&pool->mayday_timer); + + /* sched-RCU protected to allow dereferences from get_work_pool() */ + call_rcu_sched(&pool->rcu, rcu_free_pool); +} + +/** + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get + * + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it. If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one. On failure, returns NULL. + */ +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) +{ + static DEFINE_MUTEX(create_mutex); + u32 hash = wqattrs_hash(attrs); + struct worker_pool *pool; + struct worker *worker; + + mutex_lock(&create_mutex); + + /* do we already have a matching pool? */ + spin_lock_irq(&workqueue_lock); + hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { + if (wqattrs_equal(pool->attrs, attrs)) { + pool->refcnt++; + goto out_unlock; + } + } + spin_unlock_irq(&workqueue_lock); + + /* nope, create a new one */ + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool || init_worker_pool(pool) < 0) + goto fail; + + copy_workqueue_attrs(pool->attrs, attrs); + + if (worker_pool_assign_id(pool) < 0) + goto fail; + + /* create and start the initial worker */ + worker = create_worker(pool); + if (!worker) + goto fail; + + spin_lock_irq(&pool->lock); + start_worker(worker); + spin_unlock_irq(&pool->lock); + + /* install */ + spin_lock_irq(&workqueue_lock); + hash_add(unbound_pool_hash, &pool->hash_node, hash); +out_unlock: + spin_unlock_irq(&workqueue_lock); + mutex_unlock(&create_mutex); + return pool; +fail: + mutex_unlock(&create_mutex); + if (pool) + put_unbound_pool(pool); + return NULL; +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3217,7 +3408,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) if (!pwq) return -ENOMEM; - pwq->pool = get_std_worker_pool(WORK_CPU_UNBOUND, highpri); + pwq->pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); + if (!pwq->pool) { + kmem_cache_free(pwq_cache, pwq); + return -ENOMEM; + } + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } @@ -3395,6 +3591,15 @@ void destroy_workqueue(struct workqueue_struct *wq) kfree(wq->rescuer); } + /* + * We're the sole accessor of @wq at this point. Directly access + * the first pwq and put its pool. + */ + if (wq->flags & WQ_UNBOUND) { + pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, + pwqs_node); + put_unbound_pool(pwq->pool); + } free_pwqs(wq); kfree(wq); } @@ -3857,19 +4062,14 @@ static int __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); /* initialize CPU pools */ - for_each_wq_cpu(cpu) { + for_each_possible_cpu(cpu) { struct worker_pool *pool; i = 0; for_each_std_worker_pool(pool, cpu) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; - - if (cpu != WORK_CPU_UNBOUND) - cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); - else - cpumask_setall(pool->attrs->cpumask); - + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; /* alloc pool ID */ @@ -3878,14 +4078,13 @@ static int __init init_workqueues(void) } /* create the initial worker */ - for_each_online_wq_cpu(cpu) { + for_each_online_cpu(cpu) { struct worker_pool *pool; for_each_std_worker_pool(pool, cpu) { struct worker *worker; - if (cpu != WORK_CPU_UNBOUND) - pool->flags &= ~POOL_DISASSOCIATED; + pool->flags &= ~POOL_DISASSOCIATED; worker = create_worker(pool); BUG_ON(!worker); @@ -3895,6 +4094,18 @@ static int __init init_workqueues(void) } } + /* create default unbound wq attrs */ + for (i = 0; i < NR_STD_WORKER_POOLS; i++) { + struct workqueue_attrs *attrs; + + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + + attrs->nice = std_nice[i]; + cpumask_setall(attrs->cpumask); + + unbound_std_wq_attrs[i] = attrs; + } + system_wq = alloc_workqueue("events", 0, 0); system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); system_long_wq = alloc_workqueue("events_long", 0, 0); -- cgit v1.2.3 From 7a62c2c87e3bc174fe4b9e9720e148427510fcfb Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: workqueue: remove unbound_std_worker_pools[] and related helpers Workqueue no longer makes use of unbound_std_worker_pools[]. All unbound worker_pools are created dynamically and there's nothing special about the standard ones. With unbound_std_worker_pools[] unused, workqueue no longer has places where it needs to treat the per-cpu pools-cpu and unbound pools together. Remove unbound_std_worker_pools[] and the helpers wrapping it to present unified per-cpu and unbound standard worker_pools. * for_each_std_worker_pool() now only walks through per-cpu pools. * for_each[_online]_wq_cpu() which don't have any users left are removed. * std_worker_pools() and std_worker_pool_pri() are unused and removed. * get_std_worker_pool() is removed. Its only user - alloc_and_link_pwqs() - only used it for per-cpu pools anyway. Open code per_cpu access in alloc_and_link_pwqs() instead. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 66 +++++------------------------------------------------- 1 file changed, 6 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3fe2c79bf166..7642bb7b70ee 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -253,48 +253,13 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); "sched RCU or workqueue lock should be held") #define for_each_std_worker_pool(pool, cpu) \ - for ((pool) = &std_worker_pools(cpu)[0]; \ - (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) + for ((pool) = &per_cpu(cpu_std_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(cpu_std_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ + (pool)++) #define for_each_busy_worker(worker, i, pool) \ hash_for_each(pool->busy_hash, i, worker, hentry) -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, - unsigned int sw) -{ - if (cpu < nr_cpu_ids) { - if (sw & 1) { - cpu = cpumask_next(cpu, mask); - if (cpu < nr_cpu_ids) - return cpu; - } - if (sw & 2) - return WORK_CPU_UNBOUND; - } - return WORK_CPU_END; -} - -/* - * CPU iterators - * - * An extra cpu number is defined using an invalid cpu number - * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU. The following iterators are similar to for_each_*_cpu() - * iterators but also considers the unbound CPU. - * - * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND - * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND - */ -#define for_each_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) - -#define for_each_online_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) - /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor @@ -456,7 +421,6 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_std_worker_pools); -static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; /* * idr of all pools. Modifications are protected by workqueue_lock. Read @@ -466,19 +430,6 @@ static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); -static struct worker_pool *std_worker_pools(int cpu) -{ - if (cpu != WORK_CPU_UNBOUND) - return per_cpu(cpu_std_worker_pools, cpu); - else - return unbound_std_worker_pools; -} - -static int std_worker_pool_pri(struct worker_pool *pool) -{ - return pool - std_worker_pools(pool->cpu); -} - /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { @@ -496,13 +447,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) -{ - struct worker_pool *pools = std_worker_pools(cpu); - - return &pools[highpri]; -} - /** * first_pwq - return the first pool_workqueue of the specified workqueue * @wq: the target workqueue @@ -3397,8 +3341,10 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + struct worker_pool *cpu_pools = + per_cpu(cpu_std_worker_pools, cpu); - pwq->pool = get_std_worker_pool(cpu, highpri); + pwq->pool = &cpu_pools[highpri]; list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } } else { -- cgit v1.2.3 From f02ae73aaa4f285199683862ac59972877a11c5d Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: workqueue: drop "std" from cpu_std_worker_pools and for_each_std_worker_pool() All per-cpu pools are standard, so there's no need to use both "cpu" and "std" and for_each_std_worker_pool() is confusing in that it can be used only for per-cpu pools. * s/cpu_std_worker_pools/cpu_worker_pools/ * s/for_each_std_worker_pool()/for_each_cpu_worker_pool()/ Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7642bb7b70ee..2c5073214774 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -252,9 +252,9 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); lockdep_is_held(&workqueue_lock), \ "sched RCU or workqueue lock should be held") -#define for_each_std_worker_pool(pool, cpu) \ - for ((pool) = &per_cpu(cpu_std_worker_pools, cpu)[0]; \ - (pool) < &per_cpu(cpu_std_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ +#define for_each_cpu_worker_pool(pool, cpu) \ + for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) #define for_each_busy_worker(worker, i, pool) \ @@ -420,7 +420,7 @@ static bool workqueue_freezing; /* W: have wqs started freezing? */ * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_std_worker_pools); + cpu_worker_pools); /* * idr of all pools. Modifications are protected by workqueue_lock. Read @@ -3342,7 +3342,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct pool_workqueue *pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); struct worker_pool *cpu_pools = - per_cpu(cpu_std_worker_pools, cpu); + per_cpu(cpu_worker_pools, cpu); pwq->pool = &cpu_pools[highpri]; list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); @@ -3694,7 +3694,7 @@ static void wq_unbind_fn(struct work_struct *work) struct worker *worker; int i; - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); mutex_lock(&pool->assoc_mutex); @@ -3737,7 +3737,7 @@ static void wq_unbind_fn(struct work_struct *work) * unbound chain execution of pending work items if other workers * didn't already. */ - for_each_std_worker_pool(pool, cpu) + for_each_cpu_worker_pool(pool, cpu) atomic_set(&pool->nr_running, 0); } @@ -3754,7 +3754,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { struct worker *worker; if (pool->nr_workers) @@ -3772,7 +3772,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&pool->assoc_mutex); spin_lock_irq(&pool->lock); @@ -4012,7 +4012,7 @@ static int __init init_workqueues(void) struct worker_pool *pool; i = 0; - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); @@ -4027,7 +4027,7 @@ static int __init init_workqueues(void) for_each_online_cpu(cpu) { struct worker_pool *pool; - for_each_std_worker_pool(pool, cpu) { + for_each_cpu_worker_pool(pool, cpu) { struct worker *worker; pool->flags &= ~POOL_DISASSOCIATED; -- cgit v1.2.3 From ac6104cdf87cc162b0a0d78280d1dcb9752e25bb Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: workqueue: add pool ID to the names of unbound kworkers There are gonna be multiple unbound pools. Include pool ID in the name of unbound kworkers. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2c5073214774..a8b86f7b6e34 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1707,7 +1707,8 @@ static struct worker *create_worker(struct worker_pool *pool) "kworker/%d:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d%s", id, pri); + "kworker/u%d:%d%s", + pool->id, id, pri); if (IS_ERR(worker->task)) goto fail; -- cgit v1.2.3 From 493008a8e475771a2126e0ce95a73e35b371d277 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: workqueue: drop WQ_RESCUER and test workqueue->rescuer for NULL instead WQ_RESCUER is superflous. WQ_MEM_RECLAIM indicates that the user wants a rescuer and testing wq->rescuer for NULL can answer whether a given workqueue has a rescuer or not. Drop WQ_RESCUER and test wq->rescuer directly. This will help simplifying __alloc_workqueue_key() failure path by allowing it to use destroy_workqueue() on a partially constructed workqueue, which in turn will help implementing dynamic management of pool_workqueues. While at it, clear wq->rescuer after freeing it in destroy_workqueue(). This is a precaution as scheduled changes will make destruction more complex. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- include/linux/workqueue.h | 1 - kernel/workqueue.c | 22 ++++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 00c1b9ba8252..c270b4eedf16 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,7 +295,6 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ - WQ_RESCUER = 1 << 7, /* internal: workqueue has rescuer */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a8b86f7b6e34..7ff2b9c5cc3a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1827,7 +1827,7 @@ static void send_mayday(struct work_struct *work) lockdep_assert_held(&workqueue_lock); - if (!(wq->flags & WQ_RESCUER)) + if (!wq->rescuer) return; /* mayday mayday mayday */ @@ -2285,7 +2285,7 @@ sleep: * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each - * workqueue which has WQ_RESCUER set. + * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of @@ -2769,7 +2769,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) lock_map_acquire(&pwq->wq->lockdep_map); else lock_map_acquire_read(&pwq->wq->lockdep_map); @@ -3412,13 +3412,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, va_end(args); va_end(args1); - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) - flags |= WQ_RESCUER; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3449,7 +3442,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } local_irq_enable(); - if (flags & WQ_RESCUER) { + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. + */ + if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; wq->rescuer = rescuer = alloc_worker(); @@ -3533,9 +3530,10 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); - if (wq->flags & WQ_RESCUER) { + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); + wq->rescuer = NULL; } /* -- cgit v1.2.3 From d2c1d40487bb1884be085c187233084f80df052d Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: restructure __alloc_workqueue_key() * Move initialization and linking of pool_workqueues into init_and_link_pwq(). * Make the failure path use destroy_workqueue() once pool_workqueue initialization succeeds. These changes are to prepare for dynamic management of pool_workqueues and don't introduce any functional changes. While at it, convert list_del(&wq->list) to list_del_init() as a precaution as scheduled changes will make destruction more complex. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 67 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7ff2b9c5cc3a..5ac846e0085e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3329,6 +3329,23 @@ fail: return NULL; } +/* initialize @pwq which interfaces with @pool for @wq and link it in */ +static void init_and_link_pwq(struct pool_workqueue *pwq, + struct workqueue_struct *wq, + struct worker_pool *pool) +{ + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + + pwq->pool = pool; + pwq->wq = wq; + pwq->flush_color = -1; + pwq->max_active = wq->saved_max_active; + INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->mayday_node); + + list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3345,23 +3362,23 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - pwq->pool = &cpu_pools[highpri]; - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + init_and_link_pwq(pwq, wq, &cpu_pools[highpri]); } } else { struct pool_workqueue *pwq; + struct worker_pool *pool; pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) return -ENOMEM; - pwq->pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); - if (!pwq->pool) { + pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); + if (!pool) { kmem_cache_free(pwq_cache, pwq); return -ENOMEM; } - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + init_and_link_pwq(pwq, wq, pool); } return 0; @@ -3406,7 +3423,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); if (!wq) - goto err; + return NULL; vsnprintf(wq->name, namelen, fmt, args1); va_end(args); @@ -3429,18 +3446,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, INIT_LIST_HEAD(&wq->list); if (alloc_and_link_pwqs(wq) < 0) - goto err; - - local_irq_disable(); - for_each_pwq(pwq, wq) { - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - pwq->wq = wq; - pwq->flush_color = -1; - pwq->max_active = max_active; - INIT_LIST_HEAD(&pwq->delayed_works); - INIT_LIST_HEAD(&pwq->mayday_node); - } - local_irq_enable(); + goto err_free_wq; /* * Workqueues which may be used during memory reclaim should @@ -3449,16 +3455,19 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; - wq->rescuer = rescuer = alloc_worker(); + rescuer = alloc_worker(); if (!rescuer) - goto err; + goto err_destroy; rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - if (IS_ERR(rescuer->task)) - goto err; + if (IS_ERR(rescuer->task)) { + kfree(rescuer); + goto err_destroy; + } + wq->rescuer = rescuer; rescuer->task->flags |= PF_THREAD_BOUND; wake_up_process(rescuer->task); } @@ -3479,12 +3488,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, spin_unlock_irq(&workqueue_lock); return wq; -err: - if (wq) { - free_pwqs(wq); - kfree(wq->rescuer); - kfree(wq); - } + +err_free_wq: + kfree(wq); + return NULL; +err_destroy: + destroy_workqueue(wq); return NULL; } EXPORT_SYMBOL_GPL(__alloc_workqueue_key); @@ -3526,7 +3535,7 @@ void destroy_workqueue(struct workqueue_struct *wq) * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - list_del(&wq->list); + list_del_init(&wq->list); spin_unlock_irq(&workqueue_lock); -- cgit v1.2.3 From 8864b4e59f7945a636eeb27671f10486149be6e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: implement get/put_pwq() Add pool_workqueue->refcnt along with get/put_pwq(). Both per-cpu and unbound pwqs have refcnts and any work item inserted on a pwq increments the refcnt which is dropped when the work item finishes. For per-cpu pwqs the base ref is never dropped and destroy_workqueue() frees the pwqs as before. For unbound ones, destroy_workqueue() simply drops the base ref on the first pwq. When the refcnt reaches zero, pwq_unbound_release_workfn() is scheduled on system_wq, which unlinks the pwq, puts the associated pool and frees the pwq and wq as necessary. This needs to be done from a work item as put_pwq() needs to be protected by pool->lock but release can't happen with the lock held - e.g. put_unbound_pool() involves blocking operations. Unbound pool->locks are marked with lockdep subclas 1 as put_pwq() will schedule the release work item on system_wq while holding the unbound pool's lock and triggers recursive locking warning spuriously. This will be used to implement dynamic creation and destruction of unbound pwqs. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 137 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 114 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ac846e0085e..7dd8e7bcec51 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -179,6 +179,7 @@ struct pool_workqueue { struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ + int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ @@ -186,6 +187,15 @@ struct pool_workqueue { struct list_head delayed_works; /* L: delayed works */ struct list_head pwqs_node; /* R: node on wq->pwqs */ struct list_head mayday_node; /* W: node on wq->maydays */ + + /* + * Release of unbound pwq is punted to system_wq. See put_pwq() + * and pwq_unbound_release_workfn() for details. pool_workqueue + * itself is also sched-RCU protected so that the first pwq can be + * determined without grabbing workqueue_lock. + */ + struct work_struct unbound_release_work; + struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); /* @@ -939,6 +949,45 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } +/** + * get_pwq - get an extra reference on the specified pool_workqueue + * @pwq: pool_workqueue to get + * + * Obtain an extra reference on @pwq. The caller should guarantee that + * @pwq has positive refcnt and be holding the matching pool->lock. + */ +static void get_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + WARN_ON_ONCE(pwq->refcnt <= 0); + pwq->refcnt++; +} + +/** + * put_pwq - put a pool_workqueue reference + * @pwq: pool_workqueue to put + * + * Drop a reference of @pwq. If its refcnt reaches zero, schedule its + * destruction. The caller should be holding the matching pool->lock. + */ +static void put_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + if (likely(--pwq->refcnt)) + return; + if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) + return; + /* + * @pwq can't be released under pool->lock, bounce to + * pwq_unbound_release_workfn(). This never recurses on the same + * pool->lock as this path is taken only for unbound workqueues and + * the release work item is scheduled on a per-cpu workqueue. To + * avoid lockdep warning, unbound pool->locks are given lockdep + * subclass of 1 in get_unbound_pool(). + */ + schedule_work(&pwq->unbound_release_work); +} + static void pwq_activate_delayed_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -970,9 +1019,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { - /* ignore uncolored works */ + /* uncolored work items don't participate in flushing or nr_active */ if (color == WORK_NO_COLOR) - return; + goto out_put; pwq->nr_in_flight[color]--; @@ -985,11 +1034,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) - return; + goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) - return; + goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; @@ -1000,6 +1049,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); +out_put: + put_pwq(pwq); } /** @@ -1122,6 +1173,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); + get_pwq(pwq); /* * Ensure either worker_sched_deactivated() sees the above @@ -3301,6 +3353,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); if (worker_pool_assign_id(pool) < 0) @@ -3329,7 +3382,41 @@ fail: return NULL; } -/* initialize @pwq which interfaces with @pool for @wq and link it in */ +static void rcu_free_pwq(struct rcu_head *rcu) +{ + kmem_cache_free(pwq_cache, + container_of(rcu, struct pool_workqueue, rcu)); +} + +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. + */ +static void pwq_unbound_release_workfn(struct work_struct *work) +{ + struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; + + spin_lock_irq(&workqueue_lock); + list_del_rcu(&pwq->pwqs_node); + spin_unlock_irq(&workqueue_lock); + + put_unbound_pool(pool); + call_rcu_sched(&pwq->rcu, rcu_free_pwq); + + /* + * If we're the last pwq going away, @wq is already dead and no one + * is gonna access it anymore. Free it. + */ + if (list_empty(&wq->pwqs)) + kfree(wq); +} + static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) @@ -3339,9 +3426,11 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; + pwq->refcnt = 1; pwq->max_active = wq->saved_max_active; INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); + INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); } @@ -3384,15 +3473,6 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) return 0; } -static void free_pwqs(struct workqueue_struct *wq) -{ - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); - else if (!list_empty(&wq->pwqs)) - kmem_cache_free(pwq_cache, list_first_entry(&wq->pwqs, - struct pool_workqueue, pwqs_node)); -} - static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { @@ -3524,7 +3604,8 @@ void destroy_workqueue(struct workqueue_struct *wq) } } - if (WARN_ON(pwq->nr_active) || + if (WARN_ON(pwq->refcnt > 1) || + WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { spin_unlock_irq(&workqueue_lock); return; @@ -3545,17 +3626,27 @@ void destroy_workqueue(struct workqueue_struct *wq) wq->rescuer = NULL; } - /* - * We're the sole accessor of @wq at this point. Directly access - * the first pwq and put its pool. - */ - if (wq->flags & WQ_UNBOUND) { + if (!(wq->flags & WQ_UNBOUND)) { + /* + * The base ref is never dropped on per-cpu pwqs. Directly + * free the pwqs and wq. + */ + free_percpu(wq->cpu_pwqs); + kfree(wq); + } else { + /* + * We're the sole accessor of @wq at this point. Directly + * access the first pwq and put the base ref. As both pwqs + * and pools are sched-RCU protected, the lock operations + * are safe. @wq will be freed when the last pwq is + * released. + */ pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); - put_unbound_pool(pwq->pool); + spin_lock_irq(&pwq->pool->lock); + put_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); } - free_pwqs(wq); - kfree(wq); } EXPORT_SYMBOL_GPL(destroy_workqueue); -- cgit v1.2.3 From 75ccf5950f828d53aebfd3a852283a00abf2c5bf Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: prepare flush_workqueue() for dynamic creation and destrucion of unbound pool_workqueues Unbound pwqs (pool_workqueues) will be dynamically created and destroyed with the scheduled unbound workqueue w/ custom attributes support. This patch synchronizes pwq linking and unlinking against flush_workqueue() so that its operation isn't disturbed by pwqs coming and going. Linking and unlinking a pwq into wq->pwqs is now protected also by wq->flush_mutex and a new pwq's work_color is initialized to wq->work_color during linking. This ensures that pwqs changes don't disturb flush_workqueue() in progress and the new pwq's work coloring stays in sync with the rest of the workqueue. flush_mutex during unlinking isn't strictly necessary but it's simpler to do it anyway. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7dd8e7bcec51..e933979678e5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,6 +122,9 @@ enum { * W: workqueue_lock protected. * * R: workqueue_lock protected for writes. Sched-RCU protected for reads. + * + * FR: wq->flush_mutex and workqueue_lock protected for writes. Sched-RCU + * protected for reads. */ /* struct worker is defined in workqueue_internal.h */ @@ -185,7 +188,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ - struct list_head pwqs_node; /* R: node on wq->pwqs */ + struct list_head pwqs_node; /* FR: node on wq->pwqs */ struct list_head mayday_node; /* W: node on wq->maydays */ /* @@ -214,7 +217,7 @@ struct wq_flusher { struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ - struct list_head pwqs; /* R: all pwqs of this wq */ + struct list_head pwqs; /* FR: all pwqs of this wq */ struct list_head list; /* W: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ @@ -3402,9 +3405,16 @@ static void pwq_unbound_release_workfn(struct work_struct *work) if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; + /* + * Unlink @pwq. Synchronization against flush_mutex isn't strictly + * necessary on release but do it anyway. It's easier to verify + * and consistent with the linking path. + */ + mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); list_del_rcu(&pwq->pwqs_node); spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq->flush_mutex); put_unbound_pool(pool); call_rcu_sched(&pwq->rcu, rcu_free_pwq); @@ -3432,7 +3442,18 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); + /* + * Link @pwq and set the matching work_color. This is synchronized + * with flush_mutex to avoid confusing flush_workqueue(). + */ + mutex_lock(&wq->flush_mutex); + spin_lock_irq(&workqueue_lock); + + pwq->work_color = wq->work_color; list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + + spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq->flush_mutex); } static int alloc_and_link_pwqs(struct workqueue_struct *wq) -- cgit v1.2.3 From c9178087acd71b4ea010ea48e147cf66952d2da9 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: perform non-reentrancy test when queueing to unbound workqueues too Because per-cpu workqueues have multiple pwqs (pool_workqueues) to serve the CPUs, to guarantee that a single work item isn't queued on one pwq while still executing another, __queue_work() takes a look at the previous pool the target work item was on and if it's still executing there, queue the work item on that pool. To support changing workqueue_attrs on the fly, unbound workqueues too will have multiple pwqs and thus need non-reentrancy test when queueing. This patch modifies __queue_work() such that the reentrancy test is performed regardless of the workqueue type. per_cpu_ptr(wq->cpu_pwqs, cpu) used to be used to determine the matching pwq for the last pool. This can't be used for unbound workqueues and is replaced with worker->current_pwq which also happens to be simpler. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e933979678e5..16fb6747276a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1209,6 +1209,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; + struct worker_pool *last_pool; struct list_head *worklist; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1228,41 +1229,36 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; - /* determine the pwq to use */ + /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) { - struct worker_pool *last_pool; - if (cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); - - /* - * It's multi cpu. If @work was previously on a different - * cpu, it might still be running there, in which case the - * work needs to be queued on that cpu to guarantee - * non-reentrancy. - */ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - last_pool = get_work_pool(work); + } else { + pwq = first_pwq(wq); + } - if (last_pool && last_pool != pwq->pool) { - struct worker *worker; + /* + * If @work was previously on a different pool, it might still be + * running there, in which case the work needs to be queued on that + * pool to guarantee non-reentrancy. + */ + last_pool = get_work_pool(work); + if (last_pool && last_pool != pwq->pool) { + struct worker *worker; - spin_lock(&last_pool->lock); + spin_lock(&last_pool->lock); - worker = find_worker_executing_work(last_pool, work); + worker = find_worker_executing_work(last_pool, work); - if (worker && worker->current_pwq->wq == wq) { - pwq = per_cpu_ptr(wq->cpu_pwqs, last_pool->cpu); - } else { - /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); - } + if (worker && worker->current_pwq->wq == wq) { + pwq = worker->current_pwq; } else { + /* meh... not running there, queue here */ + spin_unlock(&last_pool->lock); spin_lock(&pwq->pool->lock); } } else { - pwq = first_pwq(wq); spin_lock(&pwq->pool->lock); } -- cgit v1.2.3 From 9e8cd2f5898ab6710ad81f4583fada08bf8049a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: implement apply_workqueue_attrs() Implement apply_workqueue_attrs() which applies workqueue_attrs to the specified unbound workqueue by creating a new pwq (pool_workqueue) linked to worker_pool with the specified attributes. A new pwq is linked at the head of wq->pwqs instead of tail and __queue_work() verifies that the first unbound pwq has positive refcnt before choosing it for the actual queueing. This is to cover the case where creation of a new pwq races with queueing. As base ref on a pwq won't be dropped without making another pwq the first one, __queue_work() is guaranteed to make progress and not add work item to a dead pwq. init_and_link_pwq() is updated to return the last first pwq the new pwq replaced, which is put by apply_workqueue_attrs(). Note that apply_workqueue_attrs() is almost identical to unbound pwq part of alloc_and_link_pwqs(). The only difference is that there is no previous first pwq. apply_workqueue_attrs() is implemented to handle such cases and replaces unbound pwq handling in alloc_and_link_pwqs(). Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 91 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 73 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c270b4eedf16..e152394fa7eb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,6 +410,8 @@ extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); void free_workqueue_attrs(struct workqueue_attrs *attrs); +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16fb6747276a..2a67fbbd192c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1228,7 +1228,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; - +retry: /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) { if (cpu == WORK_CPU_UNBOUND) @@ -1262,6 +1262,25 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, spin_lock(&pwq->pool->lock); } + /* + * pwq is determined and locked. For unbound pools, we could have + * raced with pwq release and it could already be dead. If its + * refcnt is zero, repeat pwq selection. Note that pwqs never die + * without another pwq replacing it as the first pwq or while a + * work item is executing on it, so the retying is guaranteed to + * make forward-progress. + */ + if (unlikely(!pwq->refcnt)) { + if (wq->flags & WQ_UNBOUND) { + spin_unlock(&pwq->pool->lock); + cpu_relax(); + goto retry; + } + /* oops */ + WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", + wq->name, cpu); + } + /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); @@ -3425,7 +3444,8 @@ static void pwq_unbound_release_workfn(struct work_struct *work) static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, - struct worker_pool *pool) + struct worker_pool *pool, + struct pool_workqueue **p_last_pwq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); @@ -3445,13 +3465,58 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); + if (p_last_pwq) + *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&workqueue_lock); mutex_unlock(&wq->flush_mutex); } +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. If @attrs doesn't match the + * current attributes, a new pwq is created and made the first pwq which + * will serve all new work items. Older pwqs are released as in-flight + * work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on + * failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct pool_workqueue *pwq, *last_pwq; + struct worker_pool *pool; + + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) + return -ENOMEM; + + pool = get_unbound_pool(attrs); + if (!pool) { + kmem_cache_free(pwq_cache, pwq); + return -ENOMEM; + } + + init_and_link_pwq(pwq, wq, pool, &last_pwq); + if (last_pwq) { + spin_lock_irq(&last_pwq->pool->lock); + put_pwq(last_pwq); + spin_unlock_irq(&last_pwq->pool->lock); + } + + return 0; +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3468,26 +3533,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - init_and_link_pwq(pwq, wq, &cpu_pools[highpri]); + init_and_link_pwq(pwq, wq, &cpu_pools[highpri], NULL); } + return 0; } else { - struct pool_workqueue *pwq; - struct worker_pool *pool; - - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) - return -ENOMEM; - - pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); - if (!pool) { - kmem_cache_free(pwq_cache, pwq); - return -ENOMEM; - } - - init_and_link_pwq(pwq, wq, pool); + return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - - return 0; } static int wq_clamp_max_active(int max_active, unsigned int flags, -- cgit v1.2.3 From 618b01eb426dd2d73a4b5e5ebc6379e4eee3b123 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: make it clear that WQ_DRAINING is an internal flag We're gonna add another internal WQ flag. Let's make the distinction clear. Prefix WQ_DRAINING with __ and move it to bit 16. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index e152394fa7eb..1751ec4c47c9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -294,7 +294,7 @@ enum { WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ + __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2a67fbbd192c..590f4d048ec7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1225,7 +1225,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DRAINING) && + if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; retry: @@ -2763,11 +2763,11 @@ void drain_workqueue(struct workqueue_struct *wq) /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. - * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ spin_lock_irq(&workqueue_lock); if (!wq->nr_drainers++) - wq->flags |= WQ_DRAINING; + wq->flags |= __WQ_DRAINING; spin_unlock_irq(&workqueue_lock); reflush: flush_workqueue(wq); @@ -2795,7 +2795,7 @@ reflush: spin_lock(&workqueue_lock); if (!--wq->nr_drainers) - wq->flags &= ~WQ_DRAINING; + wq->flags &= ~__WQ_DRAINING; spin_unlock(&workqueue_lock); local_irq_enable(); -- cgit v1.2.3 From 8719dceae2f98a578507c0f6b49c93f320bd729c Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: reject adjusting max_active or applying attrs to ordered workqueues Adjusting max_active of or applying new workqueue_attrs to an ordered workqueue breaks its ordering guarantee. The former is obvious. The latter is because applying attrs creates a new pwq (pool_workqueue) and there is no ordering constraint between the old and new pwqs. Make apply_workqueue_attrs() and workqueue_set_max_active() trigger WARN_ON() if those operations are requested on an ordered workqueue and fail / ignore respectively. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- include/linux/workqueue.h | 3 ++- kernel/workqueue.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1751ec4c47c9..5668ab249af5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,6 +295,7 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ + __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -397,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ - alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 590f4d048ec7..cecd4ffe2c40 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3494,9 +3494,14 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, struct pool_workqueue *pwq, *last_pwq; struct worker_pool *pool; + /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; + /* creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) return -ENOMEM; @@ -3752,6 +3757,10 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { struct pool_workqueue *pwq; + /* disallow meddling with max_active for ordered workqueues */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return; + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); spin_lock_irq(&workqueue_lock); -- cgit v1.2.3 From 226223ab3c4118ddd10688cc2c131135848371ab Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 11:30:05 -0700 Subject: workqueue: implement sysfs interface for workqueues There are cases where workqueue users want to expose control knobs to userland. e.g. Unbound workqueues with custom attributes are scheduled to be used for writeback workers and depending on configuration it can be useful to allow admins to tinker with the priority or allowed CPUs. This patch implements workqueue_sysfs_register(), which makes the workqueue visible under /sys/bus/workqueue/devices/WQ_NAME. There currently are two attributes common to both per-cpu and unbound pools and extra attributes for unbound pools including nice level and cpumask. If alloc_workqueue*() is called with WQ_SYSFS, workqueue_sysfs_register() is called automatically as part of workqueue creation. This is the preferred method unless the workqueue user wants to apply workqueue_attrs before making the workqueue visible to userland. v2: Disallow exposing ordered workqueues as ordered workqueues can't be tuned in any way. Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/workqueue.h | 8 ++ kernel/workqueue.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5668ab249af5..7f6d29a417c0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -293,6 +293,7 @@ enum { WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ @@ -495,4 +496,11 @@ extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ +#ifdef CONFIG_SYSFS +int workqueue_sysfs_register(struct workqueue_struct *wq); +#else /* CONFIG_SYSFS */ +static inline int workqueue_sysfs_register(struct workqueue_struct *wq) +{ return 0; } +#endif /* CONFIG_SYSFS */ + #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cecd4ffe2c40..c82feac0a878 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -210,6 +210,8 @@ struct wq_flusher { struct completion done; /* flush completion */ }; +struct wq_device; + /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: @@ -233,6 +235,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved pwq max_active */ + +#ifdef CONFIG_SYSFS + struct wq_device *wq_dev; /* I: for sysfs interface */ +#endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -442,6 +448,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) @@ -3153,6 +3161,281 @@ int keventd_up(void) return system_wq != NULL; } +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. + * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t wq_per_cpu_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} + +static ssize_t wq_max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t wq_max_active_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} + +static struct device_attribute wq_sysfs_attrs[] = { + __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), + __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), + __ATTR_NULL, +}; + +static ssize_t wq_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct worker_pool *pool; + int written; + + rcu_read_lock_sched(); + pool = first_pwq(wq)->pool; + written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + first_pwq(wq)->pool->attrs->nice); + rcu_read_unlock_sched(); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + rcu_read_lock_sched(); + copy_workqueue_attrs(attrs, first_pwq(wq)->pool->attrs); + rcu_read_unlock_sched(); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= -20 && attrs->nice <= 19) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = cpumask_scnprintf(buf, PAGE_SIZE, + first_pwq(wq)->pool->attrs->cpumask); + rcu_read_unlock_sched(); + + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_attrs = wq_sysfs_attrs, +}; + +static int __init wq_sysfs_init(void) +{ + return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Returns 0 on success, -errno on failure. + */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; + + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free @@ -3625,6 +3908,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wake_up_process(rescuer->task); } + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; + /* * workqueue_lock protects global freeze state and workqueues * list. Grab it, set max_active accordingly and add the new @@ -3693,6 +3979,8 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); + workqueue_sysfs_unregister(wq); + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); -- cgit v1.2.3 From 92266d6ef60c2381c980c6cdcb2a5c1667b36b49 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Tue, 12 Mar 2013 13:59:13 -0700 Subject: async: simplify lowest_in_progress() The code in lowest_in_progress() are duplicated in two branches, simplify them. tj: Minor indentation adjustment. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Arjan van de Ven <arjan@linux.intel.com> --- kernel/async.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index 8ddee2c3e5b0..ab99c92f6b68 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -84,24 +84,20 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct async_entry *first = NULL; + struct list_head *pending; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (domain) { - if (!list_empty(&domain->pending)) - first = list_first_entry(&domain->pending, - struct async_entry, domain_list); - } else { - if (!list_empty(&async_global_pending)) - first = list_first_entry(&async_global_pending, - struct async_entry, global_list); - } + if (domain) + pending = &domain->pending; + else + pending = &async_global_pending; - if (first) - ret = first->cookie; + if (!list_empty(pending)) + ret = list_first_entry(pending, struct async_entry, + domain_list)->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; -- cgit v1.2.3 From 362f2b098b188ede9c4350cc20e58040dbfa515e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Tue, 12 Mar 2013 13:59:14 -0700 Subject: async: rename and redefine async_func_ptr A function type is typically defined as typedef ret_type (*func)(args..) but async_func_ptr is not. Redefine it. Also rename async_func_ptr to async_func_t for _func_t suffix is more generic. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Arjan van de Ven <arjan@linux.intel.com> --- arch/sh/drivers/pci/pcie-sh7786.c | 2 +- include/linux/async.h | 6 +++--- kernel/async.c | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index c2c85f6cd738..a162a7f86b2e 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -35,7 +35,7 @@ static unsigned int nr_ports; static struct sh7786_pcie_hwops { int (*core_init)(void); - async_func_ptr *port_init_hw; + async_func_t port_init_hw; } *sh7786_pcie_hwops; static struct resource sh7786_pci0_resources[] = { diff --git a/include/linux/async.h b/include/linux/async.h index 98ea0fef30d5..6b0226bdaadc 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -16,7 +16,7 @@ #include <linux/list.h> typedef u64 async_cookie_t; -typedef void (async_func_ptr) (void *data, async_cookie_t cookie); +typedef void (*async_func_t) (void *data, async_cookie_t cookie); struct async_domain { struct list_head pending; unsigned registered:1; @@ -37,8 +37,8 @@ struct async_domain { struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 0 } -extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); -extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, +extern async_cookie_t async_schedule(async_func_t func, void *data); +extern async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain); void async_unregister_domain(struct async_domain *domain); extern void async_synchronize_full(void); diff --git a/kernel/async.c b/kernel/async.c index ab99c92f6b68..61f023ce0228 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -73,7 +73,7 @@ struct async_entry { struct list_head global_list; struct work_struct work; async_cookie_t cookie; - async_func_ptr *func; + async_func_t func; void *data; struct async_domain *domain; }; @@ -145,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) +static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -165,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_unlock_irqrestore(&async_lock, flags); /* low on memory.. run synchronously */ - ptr(data, newcookie); + func(data, newcookie); return newcookie; } INIT_LIST_HEAD(&entry->domain_list); INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = ptr; + entry->func = func; entry->data = data; entry->domain = domain; @@ -198,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a /** * async_schedule - schedule a function for asynchronous execution - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +async_cookie_t async_schedule(async_func_t func, void *data) { - return __async_schedule(ptr, data, &async_dfl_domain); + return __async_schedule(func, data, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule); /** * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * @domain: the domain * @@ -222,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule); * synchronization domain is specified via @domain. Note: This function * may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, +async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain) { - return __async_schedule(ptr, data, domain); + return __async_schedule(func, data, domain); } EXPORT_SYMBOL_GPL(async_schedule_domain); -- cgit v1.2.3 From b5b393601dbce0bce3f0717f29e6c8d1cf0295da Mon Sep 17 00:00:00 2001 From: Jiang Fang <jiang.xx.fang@gmail.com> Date: Sat, 2 Feb 2013 14:13:42 -0800 Subject: rcu: Fix spacing problem Signed-off-by: Jiang Fang <jiang.xx.fang@gmail.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b8ad827fd86..157539a975df 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -3171,7 +3171,7 @@ void __init rcu_init(void) rcu_init_one(&rcu_bh_state, &rcu_bh_data); __rcu_init_preempt(); rcu_init_nocb(); - open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); /* * We don't need protection against CPU-hotplug here because -- cgit v1.2.3 From b0f740360efec6e6471547c0548f250bc045a233 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Mon, 4 Feb 2013 12:14:24 -0800 Subject: rcu: Avoid invoking RCU core on offline CPUs Offline CPUs transition through the scheduler to the idle loop one last time before being shut down. This can result in RCU raising softirq on this CPU, which is at best useless given that the CPU's callbacks will be offloaded at CPU_DEAD time. This commit therefore avoids raising softirq on offline CPUs. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 157539a975df..b2fc234ba1b9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2169,7 +2169,8 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) static void invoke_rcu_core(void) { - raise_softirq(RCU_SOFTIRQ); + if (cpu_online(smp_processor_id())) + raise_softirq(RCU_SOFTIRQ); } /* -- cgit v1.2.3 From 0bdf5984ad647ba5d1c09ed66a75e5bf609456ba Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com> Date: Fri, 8 Feb 2013 16:11:29 -0800 Subject: rcu: Remove comment referring to __stop_machine() Although it used to be that CPU_DYING notifiers executed on the outgoing CPU with interrupts disabled and with all other CPUs spinning, this is no longer the case. This commit therefore removes this obsolete comment. Signed-off-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b2fc234ba1b9..71df6f9f5ce6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2931,11 +2931,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, break; case CPU_DYING: case CPU_DYING_FROZEN: - /* - * The whole machine is "stopped" except this CPU, so we can - * touch any data without introducing corruption. We send the - * dying CPU's callbacks to an arbitrarily chosen online CPU. - */ for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); rcu_cleanup_after_idle(cpu); -- cgit v1.2.3 From 6f0a6ad2b55053879eee89f812f3527950c380d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Sun, 10 Feb 2013 14:22:14 -0800 Subject: rcu: Delete unused rcu_node "wakemask" field Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index c896b5045d9d..8e756099a1a8 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -134,9 +134,6 @@ struct rcu_node { /* elements that need to drain to allow the */ /* current expedited grace period to */ /* complete (only for TREE_PREEMPT_RCU). */ - atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ - /* Since this has meaning only for leaf */ - /* rcu_node structures, 32 bits suffices. */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ -- cgit v1.2.3 From cfb5966bef85412dab9c93553db10b3e99ac32e8 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 12 Mar 2013 10:28:39 +0800 Subject: cpuset: fix RCU lockdep splat in cpuset_print_task_mems_allowed() Sasha reported a lockdep warning when OOM was triggered. The reason is cgroup_name() should be called with rcu_read_lock() held. Reported-by: Sasha Levin <sasha.levin@oracle.com> Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ace5bfcdcb30..efbfca7a33e4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2599,6 +2599,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) struct cgroup *cgrp = task_cs(tsk)->css.cgroup; + rcu_read_lock(); spin_lock(&cpuset_buffer_lock); nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, @@ -2607,6 +2608,7 @@ void cpuset_print_task_mems_allowed(struct task_struct *tsk) tsk->comm, cgroup_name(cgrp), cpuset_nodelist); spin_unlock(&cpuset_buffer_lock); + rcu_read_unlock(); } /* -- cgit v1.2.3 From e7b2dcc52b0e2d598a469f01cc460ccdde6869f2 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 12 Mar 2013 15:35:58 -0700 Subject: cgroup: remove cgroup_is_descendant() It was used by ns cgroup, and ns cgroup was removed long ago. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 28 ---------------------------- 2 files changed, 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5f76829dd75e..7e818a3ef60a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -448,9 +448,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if cgrp is a descendant of the task's cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); - /* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7a6c4c72ca55..f51443fd5f71 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5035,34 +5035,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) put_css_set_taskexit(cg); } -/** - * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp - * @cgrp: the cgroup in question - * @task: the task in question - * - * See if @cgrp is a descendant of @task's cgroup in the appropriate - * hierarchy. - * - * If we are sending in dummytop, then presumably we are creating - * the top cgroup in the subsystem. - * - * Called only by the ns (nsproxy) cgroup. - */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) -{ - int ret; - struct cgroup *target; - - if (cgrp == dummytop) - return 1; - - target = task_cgroup_from_root(task, cgrp->root); - while (cgrp != target && cgrp!= cgrp->top_cgroup) - cgrp = cgrp->parent; - ret = (cgrp == target); - return ret; -} - static void check_for_release(struct cgroup *cgrp) { /* All of these checks rely on RCU to keep the cgroup -- cgit v1.2.3 From 6dc01181eac16192dc4a5d1b310b78e2e97c003c Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 12 Mar 2013 15:35:58 -0700 Subject: cgroup: remove unused variables in cgroup_destroy_locked() Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f51443fd5f71..fd0b056d8da5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4346,10 +4346,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) { struct dentry *d = cgrp->dentry; struct cgroup *parent = cgrp->parent; - DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - LIST_HEAD(tmp_list); lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); -- cgit v1.2.3 From d7eeac1913ff86a17f891cb4b73f03d4b94907d0 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 12 Mar 2013 15:35:59 -0700 Subject: cgroup: hold cgroup_mutex before calling css_offline() cpuset no longer nests cgroup_mutex inside cpu_hotplug lock, so we don't have to release cgroup_mutex before calling css_offline(). Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- Documentation/cgroups/cgroups.txt | 1 + kernel/cgroup.c | 11 +---------- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index bcf1a00b06a1..0028e888828c 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -580,6 +580,7 @@ propagation along the hierarchy. See the comment on cgroup_for_each_descendant_pre() for details. void css_offline(struct cgroup *cgrp); +(cgroup_mutex held by caller) This is the counterpart of css_online() and called iff css_online() has succeeded on @cgrp. This signifies the beginning of the end of diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fd0b056d8da5..49297cbc134d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4169,17 +4169,8 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) if (!(css->flags & CSS_ONLINE)) return; - /* - * css_offline() should be called with cgroup_mutex unlocked. See - * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for - * details. This temporary unlocking should go away once - * cgroup_mutex is unexported from controllers. - */ - if (ss->css_offline) { - mutex_unlock(&cgroup_mutex); + if (ss->css_offline) ss->css_offline(cgrp); - mutex_lock(&cgroup_mutex); - } cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; } -- cgit v1.2.3 From 6ee211ad0a22869af81eef10845922ac4dcb2d38 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 12 Mar 2013 15:36:00 -0700 Subject: cgroup: don't bother to resize pid array When we open cgroup.procs, we'll allocate an buffer and store all tasks' tgid in it, and then duplicate entries will be stripped. If that results in a much smaller pid list, we'll re-allocate a smaller buffer. But we've already sucessfully allocated memory and reading the procs file is a short period and the memory will be freed very soon, so why bother to re-allocate memory. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 37 +++---------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 49297cbc134d..29c77a714731 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3400,35 +3400,14 @@ static void pidlist_free(void *p) else kfree(p); } -static void *pidlist_resize(void *p, int newcount) -{ - void *newlist; - /* note: if new alloc fails, old p will still be valid either way */ - if (is_vmalloc_addr(p)) { - newlist = vmalloc(newcount * sizeof(pid_t)); - if (!newlist) - return NULL; - memcpy(newlist, p, newcount * sizeof(pid_t)); - vfree(p); - } else { - newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); - } - return newlist; -} /* * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries - * If the new stripped list is sufficiently smaller and there's enough memory - * to allocate a new buffer, will let go of the unneeded memory. Returns the - * number of unique elements. + * Returns the number of unique elements. */ -/* is the size difference enough that we should re-allocate the array? */ -#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) -static int pidlist_uniq(pid_t **p, int length) +static int pidlist_uniq(pid_t *list, int length) { int src, dest = 1; - pid_t *list = *p; - pid_t *newlist; /* * we presume the 0th element is unique, so i starts at 1. trivial @@ -3449,16 +3428,6 @@ static int pidlist_uniq(pid_t **p, int length) dest++; } after: - /* - * if the length difference is large enough, we want to allocate a - * smaller buffer to save memory. if this fails due to out of memory, - * we'll just stay with what we've got. - */ - if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { - newlist = pidlist_resize(list, dest); - if (newlist) - *p = newlist; - } return dest; } @@ -3554,7 +3523,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) - length = pidlist_uniq(&array, length); + length = pidlist_uniq(array, length); l = cgroup_pidlist_find(cgrp, type); if (!l) { pidlist_free(array); -- cgit v1.2.3 From 80f36c2a1a612ca419e5b864a7e4808e797d9feb Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 12 Mar 2013 15:36:00 -0700 Subject: cgroup: remove useless code in cgroup_write_event_control() eventfd_poll() never returns POLLHUP. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 29c77a714731..c7fe303b5109 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3937,12 +3937,6 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, if (ret) goto fail; - if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { - event->cft->unregister_event(cgrp, event->cft, event->eventfd); - ret = 0; - goto fail; - } - /* * Events should be removed after rmdir of cgroup directory, but before * destroying subsystem state objects. Let's take reference to cgroup -- cgit v1.2.3 From e62676169118bc2d42e5008b3f8872646313f077 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 12 Mar 2013 17:41:37 -0700 Subject: workqueue: implement current_is_workqueue_rescuer() Implement a function which queries whether it currently is running off a workqueue rescuer. This will be used to convert writeback to workqueue. Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7f6d29a417c0..df30763c8682 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -451,6 +451,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); +extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c82feac0a878..f5c8bbb9ada3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4071,6 +4071,19 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) } EXPORT_SYMBOL_GPL(workqueue_set_max_active); +/** + * current_is_workqueue_rescuer - is %current workqueue rescuer? + * + * Determine whether %current is a workqueue rescuer. Can be used from + * work functions to determine whether it's being run off the rescuer task. + */ +bool current_is_workqueue_rescuer(void) +{ + struct worker *worker = current_wq_worker(); + + return worker && worker == worker->current_pwq->wq->rescuer; +} + /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question -- cgit v1.2.3 From 20f22ab42e9c832bde6e9a7ed04cdc73ec737e5b Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Mon, 4 Mar 2013 14:32:59 -0800 Subject: signals: fix new kernel-doc warnings Fix new kernel-doc warnings in kernel/signal.c: Warning(kernel/signal.c:2689): No description found for parameter 'uset' Warning(kernel/signal.c:2689): Excess function parameter 'set' description in 'sys_rt_sigpending' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2ec870a4c3c4..d63c79e7e415 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2682,7 +2682,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize) /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked - * @set: stores pending signals + * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) -- cgit v1.2.3 From 6c23cbbd5056b155401b0a2b5567d530e6c750c4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Tue, 5 Mar 2013 10:00:24 -0800 Subject: futex: fix kernel-doc notation and spello Fix kernel-doc warning in futex.c and convert 'Returns' to the new Return: kernel-doc notation format. Warning(kernel/futex.c:2286): Excess function parameter 'clockrt' description in 'futex_wait_requeue_pi' Fix one spello. Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/futex.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f0090a993dab..b26dcfc02c94 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key) * @rw: mapping needs to be read/write (values: VERIFY_READ, * VERIFY_WRITE) * - * Returns a negative error code or 0 + * Return: a negative error code or 0 + * * The key words are stored in *key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), @@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * be "current" except in the case of requeue pi. * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * - * Returns: - * 0 - ready to wait - * 1 - acquired the lock + * Return: + * 0 - ready to wait; + * 1 - acquired the lock; * <0 - error * * The hb->lock and futex_key refs shall be held by the caller. @@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * - * Returns: - * 0 - failed to acquire the lock atomicly - * 1 - acquired the lock + * Return: + * 0 - failed to acquire the lock atomically; + * 1 - acquired the lock; * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. * - * Returns: - * >=0 - on success, the number of tasks requeued or woken + * Return: + * >=0 - on success, the number of tasks requeued or woken; * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, @@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must * be paired with exactly one earlier call to queue_me(). * - * Returns: - * 1 - if the futex_q was still queued (and we removed unqueued it) + * Return: + * 1 - if the futex_q was still queued (and we removed unqueued it); * 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) @@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart); * the pi_state owner as well as handle race conditions that may allow us to * acquire the lock. Must be called with the hb lock held. * - * Returns: - * 1 - success, lock taken - * 0 - success, lock not taken + * Return: + * 1 - success, lock taken; + * 0 - success, lock not taken; * <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) @@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * Return with the hb lock held and a q.key reference on success, and unlocked * with no q.key reference on failure. * - * Returns: - * 0 - uaddr contains val and hb has been locked + * Return: + * 0 - uaddr contains val and hb has been locked; * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, @@ -2203,9 +2204,9 @@ pi_faulted: * the wakeup and return the appropriate error code to the caller. Must be * called with the hb lock held. * - * Returns - * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR + * Return: + * 0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to @@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: + * via the following-- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() * 2) wakeup on uaddr2 after a requeue * 3) signal @@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * - * Returns: - * 0 - On success + * Return: + * 0 - On success; * <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, -- cgit v1.2.3 From 6231069bdab575fce862ca786f1c0ba5e4e9ba3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Wed, 6 Mar 2013 13:37:09 -0800 Subject: rcu: Add softirq-stall indications to stall-warning messages If RCU's softirq handler is prevented from executing, an RCU CPU stall warning can result. Ways to prevent RCU's softirq handler from executing include: (1) CPU spinning with interrupts disabled, (2) infinite loop in some softirq handler, and (3) in -rt kernels, an infinite loop in a set of real-time threads running at priorities higher than that of RCU's softirq handler. Because this situation can be difficult to track down, this commit causes the count of RCU softirq handler invocations to be printed with RCU CPU stall warnings. This information does require some interpretation, as now documented in Documentation/RCU/stallwarn.txt. Reported-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Paul Gortmaker <paul.gortmaker@windriver.com> --- Documentation/RCU/stallwarn.txt | 33 ++++++++++++++++++++++++--------- kernel/rcutree.h | 5 +++++ kernel/rcutree_plugin.h | 4 +++- 3 files changed, 32 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1927151b386b..e38b8df3d727 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -92,14 +92,14 @@ If the CONFIG_RCU_CPU_STALL_INFO kernel configuration parameter is set, more information is printed with the stall-warning message, for example: INFO: rcu_preempt detected stall on CPU - 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 + 0: (63959 ticks this GP) idle=241/3fffffffffffffff/0 softirq=82/543 (t=65000 jiffies) In kernels with CONFIG_RCU_FAST_NO_HZ, even more information is printed: INFO: rcu_preempt detected stall on CPU - 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 drain=0 . timer not pending + 0: (64628 ticks this GP) idle=dd5/3fffffffffffffff/0 softirq=82/543 last_accelerate: a345/d342 nonlazy_posted: 25 .D (t=65000 jiffies) The "(64628 ticks this GP)" indicates that this CPU has taken more @@ -116,13 +116,28 @@ number between the two "/"s is the value of the nesting, which will be a small positive number if in the idle loop and a very large positive number (as shown above) otherwise. -For CONFIG_RCU_FAST_NO_HZ kernels, the "drain=0" indicates that the CPU is -not in the process of trying to force itself into dyntick-idle state, the -"." indicates that the CPU has not given up forcing RCU into dyntick-idle -mode (it would be "H" otherwise), and the "timer not pending" indicates -that the CPU has not recently forced RCU into dyntick-idle mode (it -would otherwise indicate the number of microseconds remaining in this -forced state). +The "softirq=" portion of the message tracks the number of RCU softirq +handlers that the stalled CPU has executed. The number before the "/" +is the number that had executed since boot at the time that this CPU +last noted the beginning of a grace period, which might be the current +(stalled) grace period, or it might be some earlier grace period (for +example, if the CPU might have been in dyntick-idle mode for an extended +time period. The number after the "/" is the number that have executed +since boot until the current time. If this latter number stays constant +across repeated stall-warning messages, it is possible that RCU's softirq +handlers are no longer able to execute on this CPU. This can happen if +the stalled CPU is spinning with interrupts are disabled, or, in -rt +kernels, if a high-priority process is starving RCU's softirq handler. + +For CONFIG_RCU_FAST_NO_HZ kernels, the "last_accelerate:" prints the +low-order 16 bits (in hex) of the jiffies counter when this CPU last +invoked rcu_try_advance_all_cbs() from rcu_needs_cpu() or last invoked +rcu_accelerate_cbs() from rcu_prepare_for_idle(). The "nonlazy_posted:" +prints the number of non-lazy callbacks posted since the last call to +rcu_needs_cpu(). Finally, an "L" indicates that there are currently +no non-lazy callbacks ("." is printed otherwise, as shown above) and +"D" indicates that dyntick-idle processing is enabled ("." is printed +otherwise, for example, if disabled via the "nohz=" kernel boot parameter). Multiple Warnings From One Stall diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 8e756099a1a8..26c9bb3166a3 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -325,6 +325,11 @@ struct rcu_data { struct task_struct *nocb_kthread; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + /* 8) RCU CPU stall data. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned int softirq_snap; /* Snapshot of softirq activity. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + int cpu; struct rcu_state *rsp; }; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c1cc7e17ff9d..7fcd3bbf67da 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2070,10 +2070,11 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) ticks_value = rsp->gpnum - rdp->gpnum; } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); - printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", cpu, ticks_value, ticks_title, atomic_read(&rdtp->dynticks) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), fast_no_hz); } @@ -2087,6 +2088,7 @@ static void print_cpu_stall_info_end(void) static void zero_cpu_stall_ticks(struct rcu_data *rdp) { rdp->ticks_this_gp = 0; + rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); } /* Increment ->ticks_this_gp for all flavors of RCU. */ -- cgit v1.2.3 From 81e59494a56cb14f559886c345c4a93fb576bbab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Sun, 10 Mar 2013 15:44:52 -0700 Subject: rcu: Tone down debugging during boot-up and shutdown. In some situations, randomly delaying RCU grace-period initialization can cause more trouble than help. This commit therefore restricts this type of RCU self-torture to runtime, giving it a rest during boot and shutdown. Reported-by: Sasha Levin <sasha.levin@oracle.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 71df6f9f5ce6..0e522504ae37 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1319,7 +1319,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((random32() % (rcu_num_nodes * 8)) == 0) + if ((random32() % (rcu_num_nodes * 8)) == 0 && + system_state == SYSTEM_RUNNING) schedule_timeout_uninterruptible(2); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); -- cgit v1.2.3 From 740466bc89ad8bd5afcc8de220f715f62b21e365 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 13 Mar 2013 11:15:19 -0400 Subject: tracing: Fix free of probe entry by calling call_rcu_sched() Because function tracing is very invasive, and can even trace calls to rcu_read_lock(), RCU access in function tracing is done with preempt_disable_notrace(). This requires a synchronize_sched() for updates and not a synchronize_rcu(). Function probes (traceon, traceoff, etc) must be freed after a synchronize_sched() after its entry has been removed from the hash. But call_rcu() is used. Fix this by using call_rcu_sched(). Also fix the usage to use hlist_del_rcu() instead of hlist_del(). Cc: stable@vger.kernel.org Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 98ca94a41819..e6effd0c40a9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3108,8 +3108,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } - hlist_del(&entry->node); - call_rcu(&entry->rcu, ftrace_free_entry_rcu); + hlist_del_rcu(&entry->node); + call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); } } __disable_ftrace_function_probe(); -- cgit v1.2.3 From e66eded8309ebf679d3d3c1f5820d1f2ca332c71 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Wed, 13 Mar 2013 11:51:49 -0700 Subject: userns: Don't allow CLONE_NEWUSER | CLONE_FS Don't allowing sharing the root directory with processes in a different user namespace. There doesn't seem to be any point, and to allow it would require the overhead of putting a user namespace reference in fs_struct (for permission checks) and incrementing that reference count on practically every call to fork. So just perform the inexpensive test of forbidding sharing fs_struct acrosss processes in different user namespaces. We already disallow other forms of threading when unsharing a user namespace so this should be no real burden in practice. This updates setns, clone, and unshare to disallow multiple user namespaces sharing an fs_struct. Cc: stable@vger.kernel.org Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/fork.c | 5 ++++- kernel/user_namespace.c | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 8d932b1c9056..1766d324d5e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * If unsharing a user namespace must also unshare the thread. */ if (unshare_flags & CLONE_NEWUSER) - unshare_flags |= CLONE_THREAD; + unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing a pid namespace must also unshare the thread. */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8b650837083e..b14f4d342043 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,6 +21,7 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> +#include <linux/fs_struct.h> static struct kmem_cache *user_ns_cachep __read_mostly; @@ -837,6 +838,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) if (atomic_read(¤t->mm->mm_users) > 1) return -EINVAL; + if (current->fs->users != 1) + return -EINVAL; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; -- cgit v1.2.3 From 2ca39528c01a933f6689cd6505ce65bd6d68a530 Mon Sep 17 00:00:00 2001 From: Kees Cook <keescook@chromium.org> Date: Wed, 13 Mar 2013 14:59:33 -0700 Subject: signal: always clear sa_restorer on execve When the new signal handlers are set up, the location of sa_restorer is not cleared, leaking a parent process's address space location to children. This allows for a potential bypass of the parent's ASLR by examining the sa_restorer value returned when calling sigaction(). Based on what should be considered "secret" about addresses, it only matters across the exec not the fork (since the VMAs haven't changed until the exec). But since exec sets SIG_DFL and keeps sa_restorer, this is where it should be fixed. Given the few uses of sa_restorer, a "set" function was not written since this would be the only use. Instead, we use __ARCH_HAS_SA_RESTORER, as already done in other places. Example of the leak before applying this patch: $ cat /proc/$$/maps ... 7fb9f3083000-7fb9f3238000 r-xp 00000000 fd:01 404469 .../libc-2.15.so ... $ ./leak ... 7f278bc74000-7f278be29000 r-xp 00000000 fd:01 404469 .../libc-2.15.so ... 1 0 (nil) 0x7fb9f30b94a0 2 4000000 (nil) 0x7f278bcaa4a0 3 4000000 (nil) 0x7f278bcaa4a0 4 0 (nil) 0x7fb9f30b94a0 ... [akpm@linux-foundation.org: use SA_RESTORER for backportability] Signed-off-by: Kees Cook <keescook@chromium.org> Reported-by: Emese Revfy <re.emese@gmail.com> Cc: Emese Revfy <re.emese@gmail.com> Cc: PaX Team <pageexec@freemail.hu> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Oleg Nesterov <oleg@redhat.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Serge Hallyn <serge.hallyn@canonical.com> Cc: Julien Tinnes <jln@google.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index d63c79e7e415..43b0d4a1b7ba 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default) if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; +#ifdef SA_RESTORER + ka->sa.sa_restorer = NULL; +#endif sigemptyset(&ka->sa.sa_mask); ka++; } -- cgit v1.2.3 From 522cff142d7d2f9230839c9e1f21a4d8bcc22a4a Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Wed, 13 Mar 2013 14:59:34 -0700 Subject: kernel/signal.c: use __ARCH_HAS_SA_RESTORER instead of SA_RESTORER __ARCH_HAS_SA_RESTORER is the preferred conditional for use in 3.9 and later kernels, per Kees. Cc: Emese Revfy <re.emese@gmail.com> Cc: Emese Revfy <re.emese@gmail.com> Cc: PaX Team <pageexec@freemail.hu> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Oleg Nesterov <oleg@redhat.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Serge Hallyn <serge.hallyn@canonical.com> Cc: Julien Tinnes <jln@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 43b0d4a1b7ba..dd72567767d9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,7 +485,7 @@ flush_signal_handlers(struct task_struct *t, int force_default) if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; -#ifdef SA_RESTORER +#ifdef __ARCH_HAS_SA_RESTORER ka->sa.sa_restorer = NULL; #endif sigemptyset(&ka->sa.sa_mask); -- cgit v1.2.3 From e68035fb65dec05718d765fbea14d2e527214ff6 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 14:59:38 -0700 Subject: workqueue: convert to idr_alloc() idr_get_new*() and friends are about to be deprecated. Convert to the new idr_alloc() interface. Signed-off-by: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/workqueue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811eb..55fac5b991b7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -457,11 +457,12 @@ static int worker_pool_assign_id(struct worker_pool *pool) int ret; mutex_lock(&worker_pool_idr_mutex); - idr_pre_get(&worker_pool_idr, GFP_KERNEL); - ret = idr_get_new(&worker_pool_idr, pool, &pool->id); + ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + if (ret >= 0) + pool->id = ret; mutex_unlock(&worker_pool_idr_mutex); - return ret; + return ret < 0 ? ret : 0; } /* -- cgit v1.2.3 From 0fbd95aa8a056194933fba4ae78c50fc20f0704e Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 16:51:35 -0700 Subject: workqueue: relocate pwq_set_max_active() pwq_set_max_active() is gonna be modified and used during pool_workqueue init. Move it above init_and_link_pwq(). This patch is pure code reorganization and doesn't introduce any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f5c8bbb9ada3..d51928981615 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3725,6 +3725,26 @@ static void pwq_unbound_release_workfn(struct work_struct *work) kfree(wq); } +/** + * pwq_set_max_active - adjust max_active of a pwq + * @pwq: target pool_workqueue + * @max_active: new max_active value. + * + * Set @pwq->max_active to @max_active and activate delayed works if + * increased. + * + * CONTEXT: + * spin_lock_irq(pool->lock). + */ +static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) +{ + pwq->max_active = max_active; + + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); +} + static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool, @@ -4011,26 +4031,6 @@ void destroy_workqueue(struct workqueue_struct *wq) } EXPORT_SYMBOL_GPL(destroy_workqueue); -/** - * pwq_set_max_active - adjust max_active of a pwq - * @pwq: target pool_workqueue - * @max_active: new max_active value. - * - * Set @pwq->max_active to @max_active and activate delayed works if - * increased. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) -{ - pwq->max_active = max_active; - - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); -} - /** * workqueue_set_max_active - adjust max_active of a workqueue * @wq: target workqueue -- cgit v1.2.3 From 699ce097efe8f45bc5c055e4f12cb1e271c270d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 16:51:35 -0700 Subject: workqueue: implement and use pwq_adjust_max_active() Rename pwq_set_max_active() to pwq_adjust_max_active() and move pool_workqueue->max_active synchronization and max_active determination logic into it. The new function should be called with workqueue_lock held for stable workqueue->saved_max_active, determines the current max_active value the target pool_workqueue should be using from @wq->saved_max_active and the state of the associated pool, and applies it with proper synchronization. The current two users - workqueue_set_max_active() and thaw_workqueues() - are updated accordingly. In addition, the manual freezing handling in __alloc_workqueue_key() and freeze_workqueues_begin() are replaced with calls to pwq_adjust_max_active(). This centralizes max_active handling so that it's less error-prone. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 83 +++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d51928981615..9e2ec4ca415a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3726,23 +3726,38 @@ static void pwq_unbound_release_workfn(struct work_struct *work) } /** - * pwq_set_max_active - adjust max_active of a pwq + * pwq_adjust_max_active - update a pwq's max_active to the current setting * @pwq: target pool_workqueue - * @max_active: new max_active value. - * - * Set @pwq->max_active to @max_active and activate delayed works if - * increased. * - * CONTEXT: - * spin_lock_irq(pool->lock). + * If @pwq isn't freezing, set @pwq->max_active to the associated + * workqueue's saved_max_active and activate delayed work items + * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ -static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) +static void pwq_adjust_max_active(struct pool_workqueue *pwq) { - pwq->max_active = max_active; + struct workqueue_struct *wq = pwq->wq; + bool freezable = wq->flags & WQ_FREEZABLE; + + /* for @wq->saved_max_active */ + lockdep_assert_held(&workqueue_lock); + + /* fast exit for non-freezable wqs */ + if (!freezable && pwq->max_active == wq->saved_max_active) + return; + + spin_lock(&pwq->pool->lock); + + if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + pwq->max_active = wq->saved_max_active; - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); + } else { + pwq->max_active = 0; + } + + spin_unlock(&pwq->pool->lock); } static void init_and_link_pwq(struct pool_workqueue *pwq, @@ -3932,15 +3947,14 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_destroy; /* - * workqueue_lock protects global freeze state and workqueues - * list. Grab it, set max_active accordingly and add the new - * workqueue to workqueues list. + * workqueue_lock protects global freeze state and workqueues list. + * Grab it, adjust max_active and add the new workqueue to + * workqueues list. */ spin_lock_irq(&workqueue_lock); - if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_pwq(pwq, wq) - pwq->max_active = 0; + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); list_add(&wq->list, &workqueues); @@ -4055,17 +4069,8 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) wq->saved_max_active = max_active; - for_each_pwq(pwq, wq) { - struct worker_pool *pool = pwq->pool; - - spin_lock(&pool->lock); - - if (!(wq->flags & WQ_FREEZABLE) || - !(pool->flags & POOL_FREEZING)) - pwq_set_max_active(pwq, max_active); - - spin_unlock(&pool->lock); - } + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); spin_unlock_irq(&workqueue_lock); } @@ -4358,14 +4363,8 @@ void freeze_workqueues_begin(void) /* suppress further executions by setting max_active to zero */ list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_FREEZABLE)) - continue; - - for_each_pwq(pwq, wq) { - spin_lock(&pwq->pool->lock); - pwq->max_active = 0; - spin_unlock(&pwq->pool->lock); - } + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); } spin_unlock_irq(&workqueue_lock); @@ -4445,14 +4444,8 @@ void thaw_workqueues(void) /* restore max_active and repopulate worklist */ list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_FREEZABLE)) - continue; - - for_each_pwq(pwq, wq) { - spin_lock(&pwq->pool->lock); - pwq_set_max_active(pwq, wq->saved_max_active); - spin_unlock(&pwq->pool->lock); - } + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); } /* kick workers */ -- cgit v1.2.3 From 983ca25e738ee0c9c5435a503a6bb0034d4552b0 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 16:51:35 -0700 Subject: workqueue: fix max_active handling in init_and_link_pwq() Since 9e8cd2f589 ("workqueue: implement apply_workqueue_attrs()"), init_and_link_pwq() may be called to initialize a new pool_workqueue for a workqueue which is already online, but the function was setting pwq->max_active to wq->saved_max_active without proper synchronization. Fix it by calling pwq_adjust_max_active() under proper locking instead of manually setting max_active. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9e2ec4ca415a..756761480a1a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3771,21 +3771,25 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq->wq = wq; pwq->flush_color = -1; pwq->refcnt = 1; - pwq->max_active = wq->saved_max_active; INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); - /* - * Link @pwq and set the matching work_color. This is synchronized - * with flush_mutex to avoid confusing flush_workqueue(). - */ mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); + /* + * Set the matching work_color. This is synchronized with + * flush_mutex to avoid confusing flush_workqueue(). + */ if (p_last_pwq) *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; + + /* sync max_active to the current setting */ + pwq_adjust_max_active(pwq); + + /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&workqueue_lock); -- cgit v1.2.3 From c5aa87bbf4b23f5e4f167489406daeb0ed275c47 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: workqueue: update comments and a warning message * Update incorrect and add missing synchronization labels. * Update incorrect or misleading comments. Add new comments where clarification is necessary. Reformat / rephrase some comments. * drain_workqueue() can be used separately from destroy_workqueue() but its warning message was incorrectly referring to destruction. Other than the warning message change, this patch doesn't make any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 84 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 756761480a1a..248a1e95b577 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -145,7 +145,7 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - /* workers are chained either in busy_hash or idle_list */ + /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ @@ -154,8 +154,8 @@ struct worker_pool { struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ - struct hlist_node hash_node; /* R: unbound_pool_hash node */ - int refcnt; /* refcnt for unbound pools */ + struct hlist_node hash_node; /* W: unbound_pool_hash node */ + int refcnt; /* W: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -213,8 +213,8 @@ struct wq_flusher { struct wq_device; /* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: + * The externally visible workqueue. It relays the issued work items to + * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { unsigned int flags; /* W: WQ_* flags */ @@ -247,9 +247,10 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -/* hash of all unbound pools keyed by pool->attrs */ +/* W: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); +/* I: attributes used when instantiating standard unbound pools on demand */ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; struct workqueue_struct *system_wq __read_mostly; @@ -434,16 +435,13 @@ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); static bool workqueue_freezing; /* W: have wqs started freezing? */ -/* - * The CPU and unbound standard worker pools. The unbound ones have - * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. - */ +/* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); /* - * idr of all pools. Modifications are protected by workqueue_lock. Read - * accesses are protected by sched-RCU protected. + * R: idr of all pools. Modifications are protected by workqueue_lock. + * Read accesses are protected by sched-RCU protected. */ static DEFINE_IDR(worker_pool_idr); @@ -890,13 +888,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * - * This function checks the work item address, work function and workqueue - * to avoid false positives. Note that this isn't complete as one may - * construct a work function which can introduce dependency onto itself - * through a recycled work item. Well, if somebody wants to shoot oneself - * in the foot that badly, there's only so much we can do, and if such - * deadlock actually occurs, it should be easy to locate the culprit work - * function. + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * spin_lock_irq(pool->lock). @@ -1187,9 +1184,9 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, get_pwq(pwq); /* - * Ensure either worker_sched_deactivated() sees the above - * list_add_tail() or we see zero nr_running to avoid workers - * lying around lazily while there are works to be processed. + * Ensure either wq_worker_sleeping() sees the above + * list_add_tail() or we see zero nr_running to avoid workers lying + * around lazily while there are works to be processed. */ smp_mb(); @@ -1790,6 +1787,10 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. + */ set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); @@ -1950,8 +1951,8 @@ static void pool_mayday_timeout(unsigned long __pool) * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * - * On return, need_to_create_worker() is guaranteed to be false and - * may_start_working() true. + * On return, need_to_create_worker() is guaranteed to be %false and + * may_start_working() %true. * * LOCKING: * spin_lock_irq(pool->lock) which may be released and regrabbed @@ -1959,7 +1960,7 @@ static void pool_mayday_timeout(unsigned long __pool) * manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_create_worker(struct worker_pool *pool) @@ -2016,7 +2017,7 @@ restart: * multiple times. Called only from manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2268,11 +2269,11 @@ static void process_scheduled_works(struct worker *worker) * worker_thread - the worker thread function * @__worker: self * - * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools - * of these per each cpu. These workers process all works regardless of - * their specific target workqueue. The only exception is works which - * belong to workqueues with a rescuer which will be explained in - * rescuer_thread(). + * The worker thread function. All workers belong to a worker_pool - + * either a per-cpu one or dynamic unbound one. These workers process all + * work items regardless of their specific target workqueue. The only + * exception is work items which belong to workqueues with a rescuer which + * will be explained in rescuer_thread(). */ static int worker_thread(void *__worker) { @@ -2600,11 +2601,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, * flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. + * This function sleeps until all work items which were queued on entry + * have finished execution, but it is not livelocked by new incoming ones. */ void flush_workqueue(struct workqueue_struct *wq) { @@ -2794,7 +2792,7 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", wq->name, flush_cnt); local_irq_enable(); @@ -3576,7 +3574,9 @@ static void rcu_free_pool(struct rcu_head *rcu) * @pool: worker_pool to put * * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU - * safe manner. + * safe manner. get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). */ static void put_unbound_pool(struct worker_pool *pool) { @@ -3602,7 +3602,11 @@ static void put_unbound_pool(struct worker_pool *pool) spin_unlock_irq(&workqueue_lock); - /* lock out manager and destroy all workers */ + /* + * Become the manager and destroy all workers. Grabbing + * manager_arb prevents @pool's workers from blocking on + * manager_mutex. + */ mutex_lock(&pool->manager_arb); spin_lock_irq(&pool->lock); @@ -4339,7 +4343,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their frozen_works list instead of + * workqueues will queue new works to their delayed_works list instead of * pool->worklist. * * CONTEXT: -- cgit v1.2.3 From 611c92a0203091bb022edec7e2d8b765fe148622 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: workqueue: rename @id to @pi in for_each_each_pool() Rename @id argument of for_each_pool() to @pi so that it doesn't get reused accidentally when for_each_pool() is used in combination with other iterators. This patch is purely cosmetic. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 248a1e95b577..147fc5a784f0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -283,7 +283,7 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor - * @id: integer used for iteration + * @pi: integer used for iteration * * This must be called either with workqueue_lock held or sched RCU read * locked. If the pool needs to be used beyond the locking in effect, the @@ -292,8 +292,8 @@ EXPORT_SYMBOL_GPL(system_freezable_wq); * The if/else clause exists only for the lockdep assertion and can be * ignored. */ -#define for_each_pool(pool, id) \ - idr_for_each_entry(&worker_pool_idr, pool, id) \ +#define for_each_pool(pool, pi) \ + idr_for_each_entry(&worker_pool_idr, pool, pi) \ if (({ assert_rcu_or_wq_lock(); false; })) { } \ else @@ -4354,7 +4354,7 @@ void freeze_workqueues_begin(void) struct worker_pool *pool; struct workqueue_struct *wq; struct pool_workqueue *pwq; - int id; + int pi; spin_lock_irq(&workqueue_lock); @@ -4362,7 +4362,7 @@ void freeze_workqueues_begin(void) workqueue_freezing = true; /* set FREEZING */ - for_each_pool(pool, id) { + for_each_pool(pool, pi) { spin_lock(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; @@ -4435,7 +4435,7 @@ void thaw_workqueues(void) struct workqueue_struct *wq; struct pool_workqueue *pwq; struct worker_pool *pool; - int id; + int pi; spin_lock_irq(&workqueue_lock); @@ -4443,7 +4443,7 @@ void thaw_workqueues(void) goto out_unlock; /* clear FREEZING */ - for_each_pool(pool, id) { + for_each_pool(pool, pi) { spin_lock(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; @@ -4457,7 +4457,7 @@ void thaw_workqueues(void) } /* kick workers */ - for_each_pool(pool, id) { + for_each_pool(pool, pi) { spin_lock(&pool->lock); wake_up_worker(pool); spin_unlock(&pool->lock); -- cgit v1.2.3 From 8425e3d5bdbe8e741d2c73cf3189ed59b4038b84 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: workqueue: inline trivial wrappers There's no reason to make these trivial wrappers full (exported) functions. Inline the followings. queue_work() queue_delayed_work() mod_delayed_work() schedule_work_on() schedule_work() schedule_delayed_work_on() schedule_delayed_work() keventd_up() Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/workqueue.h | 123 +++++++++++++++++++++++++++++++++++++++++----- kernel/workqueue.c | 111 ----------------------------------------- 2 files changed, 111 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index df30763c8682..835d12b76960 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -417,28 +417,16 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); -extern bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); -extern bool mod_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); -extern bool schedule_work_on(int cpu, struct work_struct *work); -extern bool schedule_work(struct work_struct *work); -extern bool schedule_delayed_work_on(int cpu, struct delayed_work *work, - unsigned long delay); -extern bool schedule_delayed_work(struct delayed_work *work, - unsigned long delay); extern int schedule_on_each_cpu(work_func_t func); -extern int keventd_up(void); int execute_in_process_context(work_func_t fn, struct execute_work *); @@ -455,6 +443,117 @@ extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns %false if @work was already on a queue, %true otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. + */ +static inline bool queue_work(struct workqueue_struct *wq, + struct work_struct *work) +{ + return queue_work_on(WORK_CPU_UNBOUND, wq, work); +} + +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. + */ +static inline bool queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +static inline bool mod_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +static inline bool schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, system_wq, work); +} + +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. + */ +static inline bool schedule_work(struct work_struct *work) +{ + return queue_work(system_wq, work); +} + +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @dwork: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ +static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, system_wq, dwork, delay); +} + +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +static inline bool schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work(system_wq, dwork, delay); +} + +/** + * keventd_up - is workqueue initialized yet? + */ +static inline bool keventd_up(void) +{ + return system_wq != NULL; +} + /* * Like above, but uses del_timer() instead of del_timer_sync(). This means, * if it returns 0 the timer function may be running and the queueing is in diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 147fc5a784f0..f37421fb4f35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1340,22 +1340,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_work_on); -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns %false if @work was already on a queue, %true otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -bool queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - return queue_work_on(WORK_CPU_UNBOUND, wq, work); -} -EXPORT_SYMBOL_GPL(queue_work); - void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -1430,21 +1414,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Equivalent to queue_delayed_work_on() but tries to use the local CPU. - */ -bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on @@ -1483,21 +1452,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(mod_delayed_work_on); -/** - * mod_delayed_work - modify delay of or queue a delayed work - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * mod_delayed_work_on() on local CPU. - */ -bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, - unsigned long delay) -{ - return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(mod_delayed_work); - /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -3001,66 +2955,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -/** - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -bool schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns %false if @work was already on the kernel-global workqueue and - * %true otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -bool schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. - */ -bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call @@ -3154,11 +3048,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -int keventd_up(void) -{ - return system_wq != NULL; -} - #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via -- cgit v1.2.3 From bc3a1afc92aea46d6df18d38e5d15867b17c69f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 19:47:39 -0700 Subject: workqueue: rename worker_pool->assoc_mutex to ->manager_mutex Manager operations are currently governed by two mutexes - pool->manager_arb and ->assoc_mutex. The former is used to decide who gets to be the manager and the latter to exclude the actual manager operations including creation and destruction of workers. Anyone who grabs ->manager_arb must perform manager role; otherwise, the pool might stall. Grabbing ->assoc_mutex blocks everyone else from performing manager operations but doesn't require the holder to perform manager duties as it's merely blocking manager operations without becoming the manager. Because the blocking was necessary when [dis]associating per-cpu workqueues during CPU hotplug events, the latter was named assoc_mutex. The mutex is scheduled to be used for other purposes, so this patch gives it a more fitting generic name - manager_mutex - and updates / adds comments to explain synchronization around the manager role and operations. This patch is pure rename / doc update. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 62 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f37421fb4f35..bc25bdfb4b42 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -60,8 +60,8 @@ enum { * %WORKER_UNBOUND set and concurrency management disabled, and may * be executing on any CPU. The pool behaves as an unbound one. * - * Note that DISASSOCIATED can be flipped only while holding - * assoc_mutex to avoid changing binding state while + * Note that DISASSOCIATED should be flipped only while holding + * manager_mutex to avoid changing binding state while * create_worker() is in progress. */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ @@ -149,8 +149,9 @@ struct worker_pool { DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ + /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ - struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ + struct mutex manager_mutex; /* manager exclusion */ struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ @@ -1635,7 +1636,7 @@ static void rebind_workers(struct worker_pool *pool) struct worker *worker, *n; int i; - lockdep_assert_held(&pool->assoc_mutex); + lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); /* dequeue and kick idle ones */ @@ -2022,31 +2023,44 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; + /* + * Managership is governed by two mutexes - manager_arb and + * manager_mutex. manager_arb handles arbitration of manager role. + * Anyone who successfully grabs manager_arb wins the arbitration + * and becomes the manager. mutex_trylock() on pool->manager_arb + * failure while holding pool->lock reliably indicates that someone + * else is managing the pool and the worker which failed trylock + * can proceed to executing work items. This means that anyone + * grabbing manager_arb is responsible for actually performing + * manager duties. If manager_arb is grabbed and released without + * actual management, the pool may stall indefinitely. + * + * manager_mutex is used for exclusion of actual management + * operations. The holder of manager_mutex can be sure that none + * of management operations, including creation and destruction of + * workers, won't take place until the mutex is released. Because + * manager_mutex doesn't interfere with manager role arbitration, + * it is guaranteed that the pool's management, while may be + * delayed, won't be disturbed by someone else grabbing + * manager_mutex. + */ if (!mutex_trylock(&pool->manager_arb)) return ret; /* - * To simplify both worker management and CPU hotplug, hold off - * management while hotplug is in progress. CPU hotplug path can't - * grab @pool->manager_arb to achieve this because that can lead to - * idle worker depletion (all become busy thinking someone else is - * managing) which in turn can result in deadlock under extreme - * circumstances. Use @pool->assoc_mutex to synchronize manager - * against CPU hotplug. - * - * assoc_mutex would always be free unless CPU hotplug is in - * progress. trylock first without dropping @pool->lock. + * With manager arbitration won, manager_mutex would be free in + * most cases. trylock first without dropping @pool->lock. */ - if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); /* * CPU hotplug could have happened while we were waiting * for assoc_mutex. Hotplug itself can't handle us * because manager isn't either on idle or busy list, and * @pool's state and ours could have deviated. * - * As hotplug is now excluded via assoc_mutex, we can + * As hotplug is now excluded via manager_mutex, we can * simply try to bind. It will succeed or fail depending * on @pool's current state. Try it and adjust * %WORKER_UNBOUND accordingly. @@ -2068,7 +2082,7 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); return ret; } @@ -3436,7 +3450,7 @@ static int init_worker_pool(struct worker_pool *pool) (unsigned long)pool); mutex_init(&pool->manager_arb); - mutex_init(&pool->assoc_mutex); + mutex_init(&pool->manager_mutex); ida_init(&pool->worker_ida); INIT_HLIST_NODE(&pool->hash_node); @@ -4076,11 +4090,11 @@ static void wq_unbind_fn(struct work_struct *work) for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); /* - * We've claimed all manager positions. Make all workers + * We've blocked all manager operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. After @@ -4095,7 +4109,7 @@ static void wq_unbind_fn(struct work_struct *work) pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); } /* @@ -4152,14 +4166,14 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: for_each_cpu_worker_pool(pool, cpu) { - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; rebind_workers(pool); spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); } break; } -- cgit v1.2.3 From ebf44d16ec4619c8a8daeacd987dd86d420ea2c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 19:47:39 -0700 Subject: workqueue: factor out initial worker creation into create_and_start_worker() get_unbound_pool(), workqueue_cpu_up_callback() and init_workqueues() have similar code pieces to create and start the initial worker factor those out into create_and_start_worker(). This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bc25bdfb4b42..cac710646cbc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1792,6 +1792,26 @@ static void start_worker(struct worker *worker) wake_up_process(worker->task); } +/** + * create_and_start_worker - create and start a worker for a pool + * @pool: the target pool + * + * Create and start a new worker for @pool. + */ +static int create_and_start_worker(struct worker_pool *pool) +{ + struct worker *worker; + + worker = create_worker(pool); + if (worker) { + spin_lock_irq(&pool->lock); + start_worker(worker); + spin_unlock_irq(&pool->lock); + } + + return worker ? 0 : -ENOMEM; +} + /** * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed @@ -3542,7 +3562,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) static DEFINE_MUTEX(create_mutex); u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - struct worker *worker; mutex_lock(&create_mutex); @@ -3568,14 +3587,9 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* create and start the initial worker */ - worker = create_worker(pool); - if (!worker) + if (create_and_start_worker(pool) < 0) goto fail; - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); - /* install */ spin_lock_irq(&workqueue_lock); hash_add(unbound_pool_hash, &pool->hash_node, hash); @@ -4148,18 +4162,10 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: for_each_cpu_worker_pool(pool, cpu) { - struct worker *worker; - if (pool->nr_workers) continue; - - worker = create_worker(pool); - if (!worker) + if (create_and_start_worker(pool) < 0) return NOTIFY_BAD; - - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); } break; @@ -4409,15 +4415,8 @@ static int __init init_workqueues(void) struct worker_pool *pool; for_each_cpu_worker_pool(pool, cpu) { - struct worker *worker; - pool->flags &= ~POOL_DISASSOCIATED; - - worker = create_worker(pool); - BUG_ON(!worker); - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); + BUG_ON(create_and_start_worker(pool) < 0); } } -- cgit v1.2.3 From cd549687a7ee5e619a26f55af4059c4ae585811c Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 19:47:39 -0700 Subject: workqueue: better define locking rules around worker creation / destruction When a manager creates or destroys workers, the operations are always done with the manager_mutex held; however, initial worker creation or worker destruction during pool release don't grab the mutex. They are still correct as initial worker creation doesn't require synchronization and grabbing manager_arb provides enough exclusion for pool release path. Still, let's make everyone follow the same rules for consistency and such that lockdep annotations can be added. Update create_and_start_worker() and put_unbound_pool() to grab manager_mutex around thread creation and destruction respectively and add lockdep assertions to create_worker() and destroy_worker(). This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cac710646cbc..ce1ab069c5fe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1715,6 +1715,8 @@ static struct worker *create_worker(struct worker_pool *pool) struct worker *worker = NULL; int id = -1; + lockdep_assert_held(&pool->manager_mutex); + spin_lock_irq(&pool->lock); while (ida_get_new(&pool->worker_ida, &id)) { spin_unlock_irq(&pool->lock); @@ -1796,12 +1798,14 @@ static void start_worker(struct worker *worker) * create_and_start_worker - create and start a worker for a pool * @pool: the target pool * - * Create and start a new worker for @pool. + * Grab the managership of @pool and create and start a new worker for it. */ static int create_and_start_worker(struct worker_pool *pool) { struct worker *worker; + mutex_lock(&pool->manager_mutex); + worker = create_worker(pool); if (worker) { spin_lock_irq(&pool->lock); @@ -1809,6 +1813,8 @@ static int create_and_start_worker(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } + mutex_unlock(&pool->manager_mutex); + return worker ? 0 : -ENOMEM; } @@ -1826,6 +1832,9 @@ static void destroy_worker(struct worker *worker) struct worker_pool *pool = worker->pool; int id = worker->id; + lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->lock); + /* sanity check frenzy */ if (WARN_ON(worker->current_work) || WARN_ON(!list_empty(&worker->scheduled))) @@ -3531,6 +3540,7 @@ static void put_unbound_pool(struct worker_pool *pool) * manager_mutex. */ mutex_lock(&pool->manager_arb); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); while ((worker = first_worker(pool))) @@ -3538,6 +3548,7 @@ static void put_unbound_pool(struct worker_pool *pool) WARN_ON(pool->nr_workers || pool->nr_idle); spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->manager_mutex); mutex_unlock(&pool->manager_arb); /* shut down the timers */ -- cgit v1.2.3 From 7d19c5ce6682fd0390049b5340d4b6bb6065d677 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: workqueue: relocate global variable defs and function decls in workqueue.c They're split across debugobj code for some reason. Collect them. This patch is pure relocation. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ce1ab069c5fe..9a0cbb2fdd64 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -248,6 +248,21 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; +/* Serializes the accesses to the list of workqueues. */ +static DEFINE_SPINLOCK(workqueue_lock); +static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ + +/* the per-cpu worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_worker_pools); + +/* + * R: idr of all pools. Modifications are protected by workqueue_lock. + * Read accesses are protected by sched-RCU protected. + */ +static DEFINE_IDR(worker_pool_idr); + /* W: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); @@ -265,6 +280,10 @@ EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); + #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> @@ -431,25 +450,6 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing; /* W: have wqs started freezing? */ - -/* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); - -/* - * R: idr of all pools. Modifications are protected by workqueue_lock. - * Read accesses are protected by sched-RCU protected. - */ -static DEFINE_IDR(worker_pool_idr); - -static int worker_thread(void *__worker); -static void copy_workqueue_attrs(struct workqueue_attrs *to, - const struct workqueue_attrs *from); - /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { -- cgit v1.2.3 From 5bcab3355a555a9c1bd4becb136cbd3651c8eafa Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: workqueue: separate out pool and workqueue locking into wq_mutex Currently, workqueue_lock protects most shared workqueue resources - the pools, workqueues, pool_workqueues, draining, ID assignments, mayday handling and so on. The coverage has grown organically and there is no identified bottleneck coming from workqueue_lock, but it has grown a bit too much and scheduled rebinding changes need the pools and workqueues to be protected by a mutex instead of a spinlock. This patch breaks out pool and workqueue synchronization from workqueue_lock into a new mutex - wq_mutex. The followings are protected by wq_mutex. * worker_pool_idr and unbound_pool_hash * pool->refcnt * workqueues list * workqueue->flags, ->nr_drainers Most changes are mostly straight-forward. workqueue_lock is replaced with wq_mutex where applicable and workqueue_lock lock/unlocks are added where wq_mutex conversion leaves data structures not protected by wq_mutex without locking. irq / preemption flippings were added where the conversion affects them. Things worth noting are * New WQ and WR locking lables added along with assert_rcu_or_wq_mutex(). * worker_pool_assign_id() now expects to be called under wq_mutex. * create_mutex is removed from get_unbound_pool(). It now just holds wq_mutex. This patch shouldn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 146 ++++++++++++++++++++++++++++------------------------- 1 file changed, 77 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9a0cbb2fdd64..c3b59ff22007 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -119,9 +119,11 @@ enum { * * F: wq->flush_mutex protected. * - * W: workqueue_lock protected. + * WQ: wq_mutex protected. + * + * WR: wq_mutex protected for writes. Sched-RCU protected for reads. * - * R: workqueue_lock protected for writes. Sched-RCU protected for reads. + * W: workqueue_lock protected. * * FR: wq->flush_mutex and workqueue_lock protected for writes. Sched-RCU * protected for reads. @@ -155,8 +157,8 @@ struct worker_pool { struct ida worker_ida; /* L: for worker IDs */ struct workqueue_attrs *attrs; /* I: worker attributes */ - struct hlist_node hash_node; /* W: unbound_pool_hash node */ - int refcnt; /* W: refcnt for unbound pools */ + struct hlist_node hash_node; /* WQ: unbound_pool_hash node */ + int refcnt; /* WQ: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -218,10 +220,10 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* W: WQ_* flags */ + unsigned int flags; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* FR: all pwqs of this wq */ - struct list_head list; /* W: list of all workqueues */ + struct list_head list; /* WQ: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ int work_color; /* F: current work color */ @@ -234,7 +236,7 @@ struct workqueue_struct { struct list_head maydays; /* W: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* W: drain in progress */ + int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* W: saved pwq max_active */ #ifdef CONFIG_SYSFS @@ -248,22 +250,19 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -/* Serializes the accesses to the list of workqueues. */ +static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing; /* W: have wqs started freezing? */ + +static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ +static bool workqueue_freezing; /* WQ: have wqs started freezing? */ /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); -/* - * R: idr of all pools. Modifications are protected by workqueue_lock. - * Read accesses are protected by sched-RCU protected. - */ -static DEFINE_IDR(worker_pool_idr); +static DEFINE_IDR(worker_pool_idr); /* WR: idr of all pools */ -/* W: hash of all unbound pools keyed by pool->attrs */ +/* WQ: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ @@ -287,6 +286,11 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> +#define assert_rcu_or_wq_mutex() \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq_mutex), \ + "sched RCU or wq_mutex should be held") + #define assert_rcu_or_wq_lock() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ lockdep_is_held(&workqueue_lock), \ @@ -305,16 +309,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with workqueue_lock held or sched RCU read - * locked. If the pool needs to be used beyond the locking in effect, the - * caller is responsible for guaranteeing that the pool stays online. + * This must be called either with wq_mutex held or sched RCU read locked. + * If the pool needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ - if (({ assert_rcu_or_wq_lock(); false; })) { } \ + if (({ assert_rcu_or_wq_mutex(); false; })) { } \ else /** @@ -455,13 +459,12 @@ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; + lockdep_assert_held(&wq_mutex); + do { if (!idr_pre_get(&worker_pool_idr, GFP_KERNEL)) return -ENOMEM; - - spin_lock_irq(&workqueue_lock); ret = idr_get_new(&worker_pool_idr, pool, &pool->id); - spin_unlock_irq(&workqueue_lock); } while (ret == -EAGAIN); return ret; @@ -574,9 +577,9 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * * Return the worker_pool @work was last associated with. %NULL if none. * - * Pools are created and destroyed under workqueue_lock, and allows read - * access under sched-RCU read lock. As such, this function should be - * called under workqueue_lock or with preemption disabled. + * Pools are created and destroyed under wq_mutex, and allows read access + * under sched-RCU read lock. As such, this function should be called + * under wq_mutex or with preemption disabled. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -588,7 +591,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); int pool_id; - assert_rcu_or_wq_lock(); + assert_rcu_or_wq_mutex(); if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) @@ -2768,10 +2771,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); reflush: flush_workqueue(wq); @@ -2796,12 +2799,12 @@ reflush: goto reflush; } - spin_lock(&workqueue_lock); + local_irq_enable(); + + mutex_lock(&wq_mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; - spin_unlock(&workqueue_lock); - - local_irq_enable(); + mutex_unlock(&wq_mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3514,16 +3517,16 @@ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); if (--pool->refcnt) { - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return; } /* sanity checks */ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || WARN_ON(!list_empty(&pool->worklist))) { - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return; } @@ -3532,7 +3535,7 @@ static void put_unbound_pool(struct worker_pool *pool) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); /* * Become the manager and destroy all workers. Grabbing @@ -3570,21 +3573,18 @@ static void put_unbound_pool(struct worker_pool *pool) */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { - static DEFINE_MUTEX(create_mutex); u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - mutex_lock(&create_mutex); + mutex_lock(&wq_mutex); /* do we already have a matching pool? */ - spin_lock_irq(&workqueue_lock); hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { if (wqattrs_equal(pool->attrs, attrs)) { pool->refcnt++; goto out_unlock; } } - spin_unlock_irq(&workqueue_lock); /* nope, create a new one */ pool = kzalloc(sizeof(*pool), GFP_KERNEL); @@ -3602,14 +3602,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) goto fail; /* install */ - spin_lock_irq(&workqueue_lock); hash_add(unbound_pool_hash, &pool->hash_node, hash); out_unlock: - spin_unlock_irq(&workqueue_lock); - mutex_unlock(&create_mutex); + mutex_unlock(&wq_mutex); return pool; fail: - mutex_unlock(&create_mutex); + mutex_unlock(&wq_mutex); if (pool) put_unbound_pool(pool); return NULL; @@ -3883,18 +3881,19 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_destroy; /* - * workqueue_lock protects global freeze state and workqueues list. - * Grab it, adjust max_active and add the new workqueue to - * workqueues list. + * wq_mutex protects global freeze state and workqueues list. Grab + * it, adjust max_active and add the new @wq to workqueues list. */ - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); + spin_lock_irq(&workqueue_lock); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); + spin_unlock_irq(&workqueue_lock); list_add(&wq->list, &workqueues); - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return wq; @@ -3920,9 +3919,8 @@ void destroy_workqueue(struct workqueue_struct *wq) /* drain it before proceeding with destruction */ drain_workqueue(wq); - spin_lock_irq(&workqueue_lock); - /* sanity checks */ + spin_lock_irq(&workqueue_lock); for_each_pwq(pwq, wq) { int i; @@ -3940,14 +3938,15 @@ void destroy_workqueue(struct workqueue_struct *wq) return; } } + spin_unlock_irq(&workqueue_lock); /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ + mutex_lock(&wq_mutex); list_del_init(&wq->list); - - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); workqueue_sysfs_unregister(wq); @@ -4267,7 +4266,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4276,26 +4275,28 @@ void freeze_workqueues_begin(void) struct pool_workqueue *pwq; int pi; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; /* set FREEZING */ for_each_pool(pool, pi) { - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); WARN_ON_ONCE(pool->flags & POOL_FREEZING); pool->flags |= POOL_FREEZING; - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } /* suppress further executions by setting max_active to zero */ + spin_lock_irq(&workqueue_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } - spin_unlock_irq(&workqueue_lock); + + mutex_unlock(&wq_mutex); } /** @@ -4305,7 +4306,7 @@ void freeze_workqueues_begin(void) * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: - * Grabs and releases workqueue_lock. + * Grabs and releases wq_mutex. * * RETURNS: * %true if some freezable workqueues are still busy. %false if freezing @@ -4317,7 +4318,7 @@ bool freeze_workqueues_busy(void) struct workqueue_struct *wq; struct pool_workqueue *pwq; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); WARN_ON_ONCE(!workqueue_freezing); @@ -4328,16 +4329,19 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. */ + preempt_disable(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; + preempt_enable(); goto out_unlock; } } + preempt_enable(); } out_unlock: - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); return busy; } @@ -4348,7 +4352,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -4357,35 +4361,37 @@ void thaw_workqueues(void) struct worker_pool *pool; int pi; - spin_lock_irq(&workqueue_lock); + mutex_lock(&wq_mutex); if (!workqueue_freezing) goto out_unlock; /* clear FREEZING */ for_each_pool(pool, pi) { - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); pool->flags &= ~POOL_FREEZING; - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } /* restore max_active and repopulate worklist */ + spin_lock_irq(&workqueue_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } + spin_unlock_irq(&workqueue_lock); /* kick workers */ for_each_pool(pool, pi) { - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); wake_up_worker(pool); - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } workqueue_freezing = false; out_unlock: - spin_unlock_irq(&workqueue_lock); + mutex_unlock(&wq_mutex); } #endif /* CONFIG_FREEZER */ @@ -4417,7 +4423,9 @@ static int __init init_workqueues(void) pool->attrs->nice = std_nice[i++]; /* alloc pool ID */ + mutex_lock(&wq_mutex); BUG_ON(worker_pool_assign_id(pool)); + mutex_unlock(&wq_mutex); } } -- cgit v1.2.3 From 794b18bc8a3f80445e1f85c9c87c74de9575c93a Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: workqueue: separate out pool_workqueue locking into pwq_lock This patch continues locking cleanup from the previous patch. It breaks out pool_workqueue synchronization from workqueue_lock into a new spinlock - pwq_lock. The followings are protected by pwq_lock. * workqueue->pwqs * workqueue->saved_max_active The conversion is straight-forward. workqueue_lock usages which cover the above two are converted to pwq_lock. New locking label PW added for things protected by pwq_lock and FR is updated to mean flush_mutex + pwq_lock + sched-RCU. This patch shouldn't introduce any visible behavior changes. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 69 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c3b59ff22007..63856dfbd082 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -123,9 +123,11 @@ enum { * * WR: wq_mutex protected for writes. Sched-RCU protected for reads. * + * PW: pwq_lock protected. + * * W: workqueue_lock protected. * - * FR: wq->flush_mutex and workqueue_lock protected for writes. Sched-RCU + * FR: wq->flush_mutex and pwq_lock protected for writes. Sched-RCU * protected for reads. */ @@ -198,7 +200,7 @@ struct pool_workqueue { * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue * itself is also sched-RCU protected so that the first pwq can be - * determined without grabbing workqueue_lock. + * determined without grabbing pwq_lock. */ struct work_struct unbound_release_work; struct rcu_head rcu; @@ -237,7 +239,7 @@ struct workqueue_struct { struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* WQ: drain in progress */ - int saved_max_active; /* W: saved pwq max_active */ + int saved_max_active; /* PW: saved pwq max_active */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -251,6 +253,7 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ +static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ static DEFINE_SPINLOCK(workqueue_lock); static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ @@ -291,10 +294,10 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq_mutex), \ "sched RCU or wq_mutex should be held") -#define assert_rcu_or_wq_lock() \ +#define assert_rcu_or_pwq_lock() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&workqueue_lock), \ - "sched RCU or workqueue lock should be held") + lockdep_is_held(&pwq_lock), \ + "sched RCU or pwq_lock should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ @@ -326,16 +329,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with workqueue_lock held or sched RCU read - * locked. If the pwq needs to be used beyond the locking in effect, the - * caller is responsible for guaranteeing that the pwq stays online. + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ - if (({ assert_rcu_or_wq_lock(); false; })) { } \ + if (({ assert_rcu_or_pwq_lock(); false; })) { } \ else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -474,13 +477,13 @@ static int worker_pool_assign_id(struct worker_pool *pool) * first_pwq - return the first pool_workqueue of the specified workqueue * @wq: the target workqueue * - * This must be called either with workqueue_lock held or sched RCU read - * locked. If the pwq needs to be used beyond the locking in effect, the - * caller is responsible for guaranteeing that the pwq stays online. + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. */ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - assert_rcu_or_wq_lock(); + assert_rcu_or_pwq_lock(); return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, pwqs_node); } @@ -3639,9 +3642,9 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * and consistent with the linking path. */ mutex_lock(&wq->flush_mutex); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); list_del_rcu(&pwq->pwqs_node); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->flush_mutex); put_unbound_pool(pool); @@ -3669,7 +3672,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) bool freezable = wq->flags & WQ_FREEZABLE; /* for @wq->saved_max_active */ - lockdep_assert_held(&workqueue_lock); + lockdep_assert_held(&pwq_lock); /* fast exit for non-freezable wqs */ if (!freezable && pwq->max_active == wq->saved_max_active) @@ -3706,7 +3709,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); mutex_lock(&wq->flush_mutex); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); /* * Set the matching work_color. This is synchronized with @@ -3722,7 +3725,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->flush_mutex); } @@ -3886,10 +3889,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, */ mutex_lock(&wq_mutex); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); list_add(&wq->list, &workqueues); @@ -3920,13 +3923,13 @@ void destroy_workqueue(struct workqueue_struct *wq) drain_workqueue(wq); /* sanity checks */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); for_each_pwq(pwq, wq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); return; } } @@ -3934,11 +3937,11 @@ void destroy_workqueue(struct workqueue_struct *wq) if (WARN_ON(pwq->refcnt > 1) || WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); return; } } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); /* * wq list is used to freeze wq, remove from list after @@ -4000,14 +4003,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); wq->saved_max_active = max_active; for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); @@ -4266,7 +4269,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, pwq_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4289,12 +4292,12 @@ void freeze_workqueues_begin(void) } /* suppress further executions by setting max_active to zero */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); mutex_unlock(&wq_mutex); } @@ -4352,7 +4355,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases wq_mutex, workqueue_lock and pool->lock's. + * Grabs and releases wq_mutex, pwq_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -4375,12 +4378,12 @@ void thaw_workqueues(void) } /* restore max_active and repopulate worklist */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&pwq_lock); /* kick workers */ for_each_pool(pool, pi) { -- cgit v1.2.3 From 2e109a2855bf6cf675a8b74dbd89b6492e8def42 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Wed, 13 Mar 2013 19:47:40 -0700 Subject: workqueue: rename workqueue_lock to wq_mayday_lock With the recent locking updates, the only thing protected by workqueue_lock is workqueue->maydays list. Rename workqueue_lock to wq_mayday_lock. This patch is pure rename. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 63856dfbd082..969be0b72071 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,10 +125,10 @@ enum { * * PW: pwq_lock protected. * - * W: workqueue_lock protected. - * * FR: wq->flush_mutex and pwq_lock protected for writes. Sched-RCU * protected for reads. + * + * MD: wq_mayday_lock protected. */ /* struct worker is defined in workqueue_internal.h */ @@ -194,7 +194,7 @@ struct pool_workqueue { int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ struct list_head pwqs_node; /* FR: node on wq->pwqs */ - struct list_head mayday_node; /* W: node on wq->maydays */ + struct list_head mayday_node; /* MD: node on wq->maydays */ /* * Release of unbound pwq is punted to system_wq. See put_pwq() @@ -235,7 +235,7 @@ struct workqueue_struct { struct list_head flusher_queue; /* F: flush waiters */ struct list_head flusher_overflow; /* F: flush overflow list */ - struct list_head maydays; /* W: pwqs requesting rescue */ + struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* WQ: drain in progress */ @@ -254,7 +254,7 @@ static struct kmem_cache *pwq_cache; static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ -static DEFINE_SPINLOCK(workqueue_lock); +static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ static bool workqueue_freezing; /* WQ: have wqs started freezing? */ @@ -1894,7 +1894,7 @@ static void send_mayday(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; - lockdep_assert_held(&workqueue_lock); + lockdep_assert_held(&wq_mayday_lock); if (!wq->rescuer) return; @@ -1911,7 +1911,7 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&workqueue_lock); /* for wq->maydays */ + spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ spin_lock(&pool->lock); if (need_to_create_worker(pool)) { @@ -1926,7 +1926,7 @@ static void pool_mayday_timeout(unsigned long __pool) } spin_unlock(&pool->lock); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&wq_mayday_lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -2404,7 +2404,7 @@ repeat: } /* see whether any pwq is asking for help */ - spin_lock_irq(&workqueue_lock); + spin_lock_irq(&wq_mayday_lock); while (!list_empty(&wq->maydays)) { struct pool_workqueue *pwq = list_first_entry(&wq->maydays, @@ -2415,7 +2415,7 @@ repeat: __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&wq_mayday_lock); /* migrate to the target cpu if possible */ worker_maybe_bind_and_lock(pool); @@ -2442,10 +2442,10 @@ repeat: rescuer->pool = NULL; spin_unlock(&pool->lock); - spin_lock(&workqueue_lock); + spin_lock(&wq_mayday_lock); } - spin_unlock_irq(&workqueue_lock); + spin_unlock_irq(&wq_mayday_lock); /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); -- cgit v1.2.3 From b66a2356d7108a15b8b5c9b8e6213e05ead22cd6 Mon Sep 17 00:00:00 2001 From: anish kumar <anish198519851985@gmail.com> Date: Tue, 12 Mar 2013 14:44:08 -0400 Subject: watchdog: Add comments to explain the watchdog_disabled variable The watchdog_disabled flag is a bit cryptic. However it's usefulness is multifold. Uses are: 1. Check if smpboot_register_percpu_thread function passed. 2. Makes sure that user enables and disables the watchdog in sequence i.e. enable watchdog->disable watchdog->enable watchdog Unlike enable watchdog->enable watchdog which is wrong. Signed-off-by: anish kumar <anish198519851985@gmail.com> [small text cleanups] Signed-off-by: Don Zickus <dzickus@redhat.com> Cc: chuansheng.liu@intel.com Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1363113848-18344-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/watchdog.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4a944676358e..05039e348f07 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -517,6 +517,11 @@ int proc_dowatchdog(struct ctl_table *table, int write, return ret; set_sample_period(); + /* + * Watchdog threads shouldn't be enabled if they are + * disabled. The 'watchdog_disabled' variable check in + * watchdog_*_all_cpus() function takes care of this. + */ if (watchdog_enabled && watchdog_thresh) watchdog_enable_all_cpus(); else -- cgit v1.2.3 From 69d34da2984c95b33ea21518227e1f9470f11d95 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 14 Mar 2013 13:50:56 -0400 Subject: tracing: Protect tracer flags with trace_types_lock Seems that the tracer flags have never been protected from synchronous writes. Luckily, admins don't usually modify the tracing flags via two different tasks. But if scripts were to be used to modify them, then they could get corrupted. Move the trace_types_lock that protects against tracers changing to also protect the flags being set. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 53df2839bb93..00daf5f8c50b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2916,6 +2916,8 @@ static int trace_set_options(char *option) cmp += 2; } + mutex_lock(&trace_types_lock); + for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { set_tracer_flags(1 << i, !neg); @@ -2924,11 +2926,10 @@ static int trace_set_options(char *option) } /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); + if (!trace_options[i]) ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - } + + mutex_unlock(&trace_types_lock); return ret; } @@ -4781,7 +4782,10 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; + + mutex_lock(&trace_types_lock); set_tracer_flags(1 << index, val); + mutex_unlock(&trace_types_lock); *ppos += cnt; -- cgit v1.2.3 From 80902822658aab18330569587cdb69ac1dfdcea8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 14 Mar 2013 14:20:54 -0400 Subject: tracing: Keep overwrite in sync between regular and snapshot buffers Changing the overwrite mode for the ring buffer via the trace option only sets the normal buffer. But the snapshot buffer could swap with it, and then the snapshot would be in non overwrite mode and the normal buffer would be in overwrite mode, even though the option flag states otherwise. Keep the two buffers overwrite modes in sync. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 00daf5f8c50b..02debabe9ed4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2895,8 +2895,12 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); - if (mask == TRACE_ITER_OVERWRITE) + if (mask == TRACE_ITER_OVERWRITE) { ring_buffer_change_overwrite(global_trace.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(max_tr.buffer, enabled); +#endif + } if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); -- cgit v1.2.3 From 613f04a0f51e6e68ac6fe571ab79da3c0a5eb4da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 14 Mar 2013 15:03:53 -0400 Subject: tracing: Prevent buffer overwrite disabled for latency tracers The latency tracers require the buffers to be in overwrite mode, otherwise they get screwed up. Force the buffers to stay in overwrite mode when latency tracers are enabled. Added a flag_changed() method to the tracer structure to allow the tracers to see what flags are being changed, and also be able to prevent the change from happing. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 38 ++++++++++++++++++++++++++++++++------ kernel/trace/trace.h | 6 ++++++ kernel/trace/trace_irqsoff.c | 19 ++++++++++++++----- kernel/trace/trace_sched_wakeup.c | 18 +++++++++++++----- 4 files changed, 65 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 02debabe9ed4..4f1dade56981 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2881,11 +2881,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return -EINVAL; } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) - return; + return 0; + + /* Give the tracer a chance to approve the change */ + if (current_trace->flag_changed) + if (current_trace->flag_changed(current_trace, mask, !!enabled)) + return -EINVAL; if (enabled) trace_flags |= mask; @@ -2904,13 +2918,15 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); + + return 0; } static int trace_set_options(char *option) { char *cmp; int neg = 0; - int ret = 0; + int ret = -ENODEV; int i; cmp = strstrip(option); @@ -2924,7 +2940,7 @@ static int trace_set_options(char *option) for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); + ret = set_tracer_flag(1 << i, !neg); break; } } @@ -2943,6 +2959,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[64]; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2952,7 +2969,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - trace_set_options(buf); + ret = trace_set_options(buf); + if (ret < 0) + return ret; *ppos += cnt; @@ -3256,6 +3275,9 @@ static int tracing_set_tracer(const char *buf) goto out; trace_branch_disable(); + + current_trace->enabled = false; + if (current_trace->reset) current_trace->reset(tr); @@ -3300,6 +3322,7 @@ static int tracing_set_tracer(const char *buf) } current_trace = t; + current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -4788,9 +4811,12 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; mutex_lock(&trace_types_lock); - set_tracer_flags(1 << index, val); + ret = set_tracer_flag(1 << index, val); mutex_unlock(&trace_types_lock); + if (ret < 0) + return ret; + *ppos += cnt; return cnt; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d56..2081971367ea 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -283,11 +283,15 @@ struct tracer { enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct tracer *tracer, + u32 mask, int set); struct tracer *next; struct tracer_flags *flags; bool print_max; bool use_max_tr; bool allocated_snapshot; + bool enabled; }; @@ -943,6 +947,8 @@ extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac4881..443b25b43b4f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,7 +32,7 @@ enum { static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -558,8 +558,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) static void __irqsoff_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; irqsoff_trace = tr; @@ -573,10 +576,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr) static void irqsoff_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_irqsoff_tracer(tr, is_graph()); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void irqsoff_tracer_start(struct trace_array *tr) @@ -609,6 +615,7 @@ static struct tracer irqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif @@ -642,6 +649,7 @@ static struct tracer preemptoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif @@ -677,6 +685,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a1..fde652c9a511 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr); static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; #define TRACE_DISPLAY_GRAPH 1 @@ -540,8 +540,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int __wakeup_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; wakeup_trace = tr; @@ -563,12 +566,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); } static void wakeup_tracer_start(struct trace_array *tr) @@ -594,6 +600,7 @@ static struct tracer wakeup_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif @@ -615,6 +622,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = trace_keep_overwrite, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif -- cgit v1.2.3 From ae63b31e4d0e2ec09c569306ea46f664508ef717 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Thu, 3 May 2012 23:09:03 -0400 Subject: tracing: Separate out trace events from global variables The trace events for ftrace are all defined via global variables. The arrays of events and event systems are linked to a global list. This prevents multiple users of the event system (what to enable and what not to). By adding descriptors to represent the event/file relation, as well as to which trace_array descriptor they are associated with, allows for more than one set of events to be defined. Once the trace events files have a link between the trace event and the trace_array they are associated with, we can create multiple trace_arrays that can record separate events in separate buffers. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace_event.h | 51 ++- include/trace/ftrace.h | 3 +- kernel/trace/trace.c | 8 + kernel/trace/trace.h | 39 +- kernel/trace/trace_events.c | 776 +++++++++++++++++++++++++------------ kernel/trace/trace_events_filter.c | 5 +- 6 files changed, 622 insertions(+), 260 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 13a54d0bdfa8..c7191d482f98 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -182,18 +182,20 @@ extern int ftrace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); enum { - TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, - TRACE_EVENT_FL_RECORDED_CMD_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, }; +/* + * Event flags: + * FILTERED - The event has a filter attached + * CAP_ANY - Any user can enable for perf + * NO_SET_FILTER - Set when filter has error and is to be ignored + */ enum { - TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), - TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), @@ -203,12 +205,44 @@ struct ftrace_event_call { struct list_head list; struct ftrace_event_class *class; char *name; - struct dentry *dir; struct trace_event event; const char *print_fmt; struct event_filter *filter; + struct list_head *files; void *mod; void *data; + int flags; /* static flags of different events */ + +#ifdef CONFIG_PERF_EVENTS + int perf_refcount; + struct hlist_head __percpu *perf_events; +#endif +}; + +struct trace_array; +struct ftrace_subsystem_dir; + +enum { + FTRACE_EVENT_FL_ENABLED_BIT, + FTRACE_EVENT_FL_RECORDED_CMD_BIT, +}; + +/* + * Ftrace event file flags: + * ENABELD - The event is enabled + * RECORDED_CMD - The comms should be recorded at sched_switch + */ +enum { + FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), + FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), +}; + +struct ftrace_event_file { + struct list_head list; + struct ftrace_event_call *event_call; + struct dentry *dir; + struct trace_array *tr; + struct ftrace_subsystem_dir *system; /* * 32 bit flags: @@ -223,17 +257,12 @@ struct ftrace_event_call { * * Note: Reads of flags do not hold the event_mutex since * they occur in critical sections. But the way flags - * is currently used, these changes do no affect the code + * is currently used, these changes do not affect the code * except that when a change is made, it may have a slight * delay in propagating the changes to other CPUs due to * caching and such. */ unsigned int flags; - -#ifdef CONFIG_PERF_EVENTS - int perf_refcount; - struct hlist_head __percpu *perf_events; -#endif }; #define __TRACE_EVENT_FLAGS(name, value) \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 40dc5e8fe340..191d9661e277 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -518,7 +518,8 @@ static inline notrace int ftrace_get_offsets_##call( \ static notrace void \ ftrace_raw_event_##call(void *__data, proto) \ { \ - struct ftrace_event_call *event_call = __data; \ + struct ftrace_event_file *ftrace_file = __data; \ + struct ftrace_event_call *event_call = ftrace_file->event_call; \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade56981..932931897b8d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -189,6 +189,8 @@ unsigned long long ns2usecs(cycle_t nsec) */ static struct trace_array global_trace; +LIST_HEAD(ftrace_trace_arrays); + static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); int filter_current_check_discard(struct ring_buffer *buffer, @@ -5359,6 +5361,12 @@ __init static int tracer_alloc_buffers(void) register_die_notifier(&trace_die_notifier); + global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + + INIT_LIST_HEAD(&global_trace.systems); + INIT_LIST_HEAD(&global_trace.events); + list_add(&global_trace.list, &ftrace_trace_arrays); + while (trace_boot_options) { char *option; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2081971367ea..037f7eb03d69 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -158,13 +158,39 @@ struct trace_array_cpu { */ struct trace_array { struct ring_buffer *buffer; + struct list_head list; int cpu; int buffer_disabled; + unsigned int flags; cycle_t time_start; + struct dentry *dir; + struct dentry *event_dir; + struct list_head systems; + struct list_head events; struct task_struct *waiter; struct trace_array_cpu *data[NR_CPUS]; }; +enum { + TRACE_ARRAY_FL_GLOBAL = (1 << 0) +}; + +extern struct list_head ftrace_trace_arrays; + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. + */ +static inline struct trace_array *top_trace_array(void) +{ + struct trace_array *tr; + + tr = list_entry(ftrace_trace_arrays.prev, + typeof(*tr), list); + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + return tr; +} + #define FTRACE_CMP_TYPE(var, type) \ __builtin_types_compatible_p(typeof(var), type *) @@ -851,12 +877,19 @@ struct event_filter { struct event_subsystem { struct list_head list; const char *name; - struct dentry *entry; struct event_filter *filter; - int nr_events; int ref_count; }; +struct ftrace_subsystem_dir { + struct list_head list; + struct event_subsystem *subsystem; + struct trace_array *tr; + struct dentry *entry; + int ref_count; + int nr_events; +}; + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) @@ -914,7 +947,7 @@ extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250c..439955239bae 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -36,6 +36,19 @@ EXPORT_SYMBOL_GPL(event_storage); LIST_HEAD(ftrace_events); LIST_HEAD(ftrace_common_fields); +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + struct ftrace_event_file *___n; \ + list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file() \ + } + struct list_head * trace_get_fields(struct ftrace_event_call *event_call) { @@ -149,15 +162,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->name, call->class->probe, - call); + file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->name, call->class->probe, - call); + file); return 0; #ifdef CONFIG_PERF_EVENTS @@ -183,54 +198,57 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg); void trace_event_enable_cmd_record(bool enable) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + do_for_each_event_file(tr, file) { + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; } else { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; } - } + } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, - int enable) +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) { + struct ftrace_event_call *call = file->event_call; int ret = 0; switch (enable) { case 0: - if (call->flags & TRACE_EVENT_FL_ENABLED) { - call->flags &= ~TRACE_EVENT_FL_ENABLED; - if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) { + file->flags &= ~FTRACE_EVENT_FL_ENABLED; + if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; } - call->class->reg(call, TRACE_REG_UNREGISTER, NULL); + call->class->reg(call, TRACE_REG_UNREGISTER, file); } break; case 1: - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; } - ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " "%s\n", call->name); break; } - call->flags |= TRACE_EVENT_FL_ENABLED; + file->flags |= FTRACE_EVENT_FL_ENABLED; } break; } @@ -238,13 +256,13 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, return ret; } -static void ftrace_clear_events(void) +static void ftrace_clear_events(struct trace_array *tr) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - ftrace_event_enable_disable(call, 0); + list_for_each_entry(file, &tr->events, list) { + ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } @@ -257,6 +275,8 @@ static void __put_system(struct event_subsystem *system) if (--system->ref_count) return; + list_del(&system->list); + if (filter) { kfree(filter->filter_string); kfree(filter); @@ -271,24 +291,45 @@ static void __get_system(struct event_subsystem *system) system->ref_count++; } -static void put_system(struct event_subsystem *system) +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + dir->ref_count++; + __get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + /* If the subsystem is about to be freed, the dir must be too */ + WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + + __put_system(dir->subsystem); + if (!--dir->ref_count) + kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir) { mutex_lock(&event_mutex); - __put_system(system); + __put_system_dir(dir); mutex_unlock(&event_mutex); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ -static int __ftrace_set_clr_event(const char *match, const char *sub, - const char *event, int set) +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { + struct ftrace_event_file *file; struct ftrace_event_call *call; int ret = -EINVAL; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; if (!call->name || !call->class || !call->class->reg) continue; @@ -307,7 +348,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, if (event && strcmp(event, call->name) != 0) continue; - ftrace_event_enable_disable(call, set); + ftrace_event_enable_disable(file, set); ret = 0; } @@ -316,7 +357,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, return ret; } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; @@ -344,7 +385,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - return __ftrace_set_clr_event(match, sub, event, set); + return __ftrace_set_clr_event(tr, match, sub, event, set); } /** @@ -361,7 +402,9 @@ static int ftrace_set_clr_event(char *buf, int set) */ int trace_set_clr_event(const char *system, const char *event, int set) { - return __ftrace_set_clr_event(NULL, system, event, set); + struct trace_array *tr = top_trace_array(); + + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -373,6 +416,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) @@ -395,7 +440,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(parser.buffer + !set, set); + ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } @@ -411,17 +456,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { + list_for_each_entry_continue(file, &tr->events, list) { + call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->class && call->class->reg) - return call; + return file; } return NULL; @@ -429,30 +477,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = t_next(m, call, &l); - if (!call) + file = t_next(m, file, &l); + if (!file) break; } - return call; + return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->flags & TRACE_EVENT_FL_ENABLED) - return call; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return file; } return NULL; @@ -460,23 +510,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = s_next(m, call, &l); - if (!call) + file = s_next(m, file, &l); + if (!file) break; } - return call; + return file; } static int t_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); @@ -494,10 +546,10 @@ static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; char *buf; - if (call->flags & TRACE_EVENT_FL_ENABLED) + if (file->flags & FTRACE_EVENT_FL_ENABLED) buf = "1\n"; else buf = "0\n"; @@ -509,10 +561,13 @@ static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; unsigned long val; int ret; + if (!file) + return -EINVAL; + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -525,7 +580,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; @@ -543,14 +598,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = dir->tr; char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (!call->name || !call->class || !call->class->reg) continue; @@ -562,7 +621,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); + set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -584,7 +643,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; @@ -607,7 +667,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; @@ -845,43 +905,75 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; + struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_array *tr; int ret; - if (!inode->i_private) - goto skip_search; - /* Make sure the system still exists */ mutex_lock(&event_mutex); - list_for_each_entry(system, &event_subsystems, list) { - if (system == inode->i_private) { - /* Don't open systems with no events */ - if (!system->nr_events) { - system = NULL; - break; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + list_for_each_entry(dir, &tr->systems, list) { + if (dir == inode->i_private) { + /* Don't open systems with no events */ + if (dir->nr_events) { + __get_system_dir(dir); + system = dir->subsystem; + } + goto exit_loop; } - __get_system(system); - break; } } + exit_loop: mutex_unlock(&event_mutex); - if (system != inode->i_private) + if (!system) return -ENODEV; - skip_search: + /* Some versions of gcc think dir can be uninitialized here */ + WARN_ON(!dir); + ret = tracing_open_generic(inode, filp); - if (ret < 0 && system) - put_system(system); + if (ret < 0) + put_system(dir); + + return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ + struct ftrace_subsystem_dir *dir; + struct trace_array *tr = inode->i_private; + int ret; + + /* Make a temporary dir that has no system but points to tr */ + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return -ENOMEM; + + dir->tr = tr; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + kfree(dir); + + filp->private_data = dir; return ret; } static int subsystem_release(struct inode *inode, struct file *file) { - struct event_subsystem *system = inode->i_private; + struct ftrace_subsystem_dir *dir = file->private_data; - if (system) - put_system(system); + /* + * If dir->subsystem is NULL, then this is a temporary + * descriptor that was made for a trace_array to enable + * all subsystems. + */ + if (dir->subsystem) + put_system(dir); + else + kfree(dir); return 0; } @@ -890,7 +982,8 @@ static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; @@ -915,7 +1008,7 @@ static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; char *buf; int err; @@ -932,7 +1025,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_subsystem_event_filter(system, buf); + err = apply_subsystem_event_filter(dir, buf); free_page((unsigned long) buf); if (err < 0) return err; @@ -1041,30 +1134,35 @@ static const struct file_operations ftrace_system_enable_fops = { .release = subsystem_release, }; +static const struct file_operations ftrace_tr_enable_fops = { + .open = system_tr_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, .llseek = default_llseek, }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) { - static struct dentry *d_tracer; - static struct dentry *d_events; - - if (d_events) - return d_events; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return NULL; + struct seq_file *m; + int ret; - d_events = debugfs_create_dir("events", d_tracer); - if (!d_events) - pr_warning("Could not create debugfs " - "'events' directory\n"); + ret = seq_open(file, seq_ops); + if (ret < 0) + return ret; + m = file->private_data; + /* copy tr over to seq ops */ + m->private = inode->i_private; - return d_events; + return ret; } static int @@ -1072,117 +1170,169 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; + struct trace_array *tr = inode->i_private; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_clear_events(); + ftrace_clear_events(tr); - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ + struct event_subsystem *system; + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) + return NULL; + + system->ref_count = 1; + system->name = kstrdup(name, GFP_KERNEL); + + if (!system->name) + goto out_free; + + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) + goto out_free; + + list_add(&system->list, &event_subsystems); + + return system; + + out_free: + kfree(system->name); + kfree(system); + return NULL; } static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, + struct ftrace_event_file *file, struct dentry *parent) { + struct ftrace_subsystem_dir *dir; struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + system = dir->subsystem; if (strcmp(system->name, name) == 0) { - system->nr_events++; - return system->entry; + dir->nr_events++; + file->system = dir; + return dir->entry; } } - /* need to create new entry */ - system = kmalloc(sizeof(*system), GFP_KERNEL); - if (!system) { - pr_warning("No memory to create event subsystem %s\n", - name); - return d_events; + /* Now see if the system itself exists. */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + break; } + /* Reset system variable when not found */ + if (&system->list == &event_subsystems) + system = NULL; - system->entry = debugfs_create_dir(name, d_events); - if (!system->entry) { - pr_warning("Could not create event subsystem %s\n", - name); - kfree(system); - return d_events; - } + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + goto out_fail; - system->nr_events = 1; - system->ref_count = 1; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) { - debugfs_remove(system->entry); - kfree(system); - return d_events; + if (!system) { + system = create_new_subsystem(name); + if (!system) + goto out_free; + } else + __get_system(system); + + dir->entry = debugfs_create_dir(name, parent); + if (!dir->entry) { + pr_warning("Failed to create system directory %s\n", name); + __put_system(system); + goto out_free; } - list_add(&system->list, &event_subsystems); - - system->filter = NULL; - - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); - if (!system->filter) { - pr_warning("Could not allocate filter for subsystem " - "'%s'\n", name); - return system->entry; - } + dir->tr = tr; + dir->ref_count = 1; + dir->nr_events = 1; + dir->subsystem = system; + file->system = dir; - entry = debugfs_create_file("filter", 0644, system->entry, system, + entry = debugfs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); + pr_warning("Could not create debugfs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, system, + trace_create_file("enable", 0644, dir->entry, dir, &ftrace_system_enable_fops); - return system->entry; + list_add(&dir->list, &tr->systems); + + return dir->entry; + + out_free: + kfree(dir); + out_fail: + /* Only print this message if failed on memory allocation */ + if (!dir || !system) + pr_warning("No memory to create event subsystem %s\n", + name); + return NULL; } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +event_create_dir(struct dentry *parent, + struct ftrace_event_file *file, const struct file_operations *id, const struct file_operations *enable, const struct file_operations *filter, const struct file_operations *format) { + struct ftrace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; struct list_head *head; + struct dentry *d_events; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->class->system, d_events); - - call->dir = debugfs_create_dir(call->name, d_events); - if (!call->dir) { - pr_warning("Could not create debugfs " - "'%s' directory\n", call->name); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { + d_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!d_events) + return -ENOMEM; + } else + d_events = parent; + + file->dir = debugfs_create_dir(call->name, d_events); + if (!file->dir) { + pr_warning("Could not create debugfs '%s' directory\n", + call->name); return -1; } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, call->dir, call, + trace_create_file("enable", 0644, file->dir, file, enable); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, call->dir, call, + trace_create_file("id", 0444, file->dir, call, id); #endif @@ -1196,23 +1346,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); - return ret; + return -1; } } - trace_create_file("filter", 0644, call->dir, call, + trace_create_file("filter", 0644, file->dir, call, filter); - trace_create_file("format", 0444, call->dir, call, + trace_create_file("format", 0444, file->dir, call, format); return 0; } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + debugfs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ + struct ftrace_event_file *file; + struct trace_array *tr; + + do_for_each_event_file_safe(tr, file) { + + if (file->event_call != call) + continue; + + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kfree(file); + + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); +} + static void event_remove(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); + struct trace_array *tr; + struct ftrace_event_file *file; + + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + ftrace_event_enable_disable(file, 0); + /* + * The do_for_each_event_file() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); + if (call->event.funcs) __unregister_ftrace_event(&call->event); + remove_event_from_tracers(call); list_del(&call->list); } @@ -1234,61 +1437,58 @@ static int event_init(struct ftrace_event_call *call) } static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +__register_event(struct ftrace_event_call *call, struct module *mod) { - struct dentry *d_events; int ret; ret = event_init(call); if (ret < 0) return ret; - d_events = event_trace_events_dir(); - if (!d_events) - return -ENOENT; - - ret = event_create_dir(call, d_events, id, enable, filter, format); - if (!ret) - list_add(&call->list, &ftrace_events); + list_add(&call->list, &ftrace_events); call->mod = mod; - return ret; + return 0; } +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) +{ + struct ftrace_event_file *file; + + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) + return -ENOMEM; + + file->event_call = call; + file->tr = tr; + list_add(&file->list, &tr->events); + + return event_create_dir(tr->event_dir, file, id, enable, filter, format); +} + +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops); + /* Add an additional event_call dynamically */ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - mutex_unlock(&event_mutex); - return ret; -} -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; + ret = __register_event(call, NULL); + if (ret >= 0) + __add_event_to_tracers(call, NULL); - if (strcmp(name, TRACE_SYSTEM) == 0) - return; - - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - debugfs_remove_recursive(system->entry); - list_del(&system->list); - __put_system(system); - } - break; - } - } + mutex_unlock(&event_mutex); + return ret; } /* @@ -1299,8 +1499,6 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) event_remove(call); trace_destroy_fields(call); destroy_preds(call); - debugfs_remove_recursive(call->dir); - remove_subsystem_dir(call->class->system); } /* Remove an event_call */ @@ -1335,6 +1533,17 @@ struct ftrace_module_file_ops { struct file_operations filter; }; +static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + return file_ops; + } + return NULL; +} + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { @@ -1386,9 +1595,8 @@ static void trace_module_add_events(struct module *mod) return; for_each_event(call, start, end) { - __trace_add_event_call(*call, mod, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + __register_event(*call, mod); + __add_event_to_tracers(*call, file_ops); } } @@ -1444,6 +1652,10 @@ static int trace_module_notify(struct notifier_block *self, return 0; } #else +static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod) +{ + return NULL; +} static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -1451,6 +1663,72 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_module_file_ops *file_ops = NULL; + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + if (call->mod) { + /* + * Directories for events by modules need to + * keep module ref counts when opened (as we don't + * want the module to disappear when reading one + * of these files). The file_ops keep account of + * the module ref count. + * + * As event_calls are added in groups by module, + * when we find one file_ops, we don't need to search for + * each call in that module, as the rest should be the + * same. Only search for a new one if the last one did + * not match. + */ + if (!file_ops || call->mod != file_ops->mod) + file_ops = find_ftrace_file_ops(call->mod); + if (!file_ops) + continue; /* Warn? */ + ret = __trace_add_new_event(call, tr, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + continue; + } + ret = __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + } +} + +static void +__add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (file_ops) + __trace_add_new_event(call, tr, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + else + __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + } +} + static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, @@ -1471,8 +1749,43 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + struct dentry *d_events; + struct dentry *entry; + + entry = debugfs_create_file("set_event", 0644, parent, + tr, &ftrace_set_event_fops); + if (!entry) { + pr_warning("Could not create debugfs 'set_event' entry\n"); + return -ENOMEM; + } + + d_events = debugfs_create_dir("events", parent); + if (!d_events) + pr_warning("Could not create debugfs 'events' directory\n"); + + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + + tr->event_dir = d_events; + __trace_add_event_dirs(tr); + + return 0; +} + static __init int event_trace_enable(void) { + struct trace_array *tr = top_trace_array(); struct ftrace_event_call **iter, *call; char *buf = bootup_event_buf; char *token; @@ -1494,7 +1807,7 @@ static __init int event_trace_enable(void) if (!*token) continue; - ret = ftrace_set_clr_event(token, 1); + ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } @@ -1506,61 +1819,29 @@ static __init int event_trace_enable(void) static __init int event_trace_init(void) { - struct ftrace_event_call *call; + struct trace_array *tr; struct dentry *d_tracer; struct dentry *entry; - struct dentry *d_events; int ret; + tr = top_trace_array(); + d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, - NULL, &ftrace_avail_fops); + tr, &ftrace_avail_fops); if (!entry) pr_warning("Could not create debugfs " "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, - NULL, &ftrace_set_event_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_event' entry\n"); - - d_events = event_trace_events_dir(); - if (!d_events) - return 0; - - /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - trace_create_file("enable", 0644, d_events, - NULL, &ftrace_system_enable_fops); - if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - /* - * Early initialization already enabled ftrace event. - * Now it's only necessary to create the event directory. - */ - list_for_each_entry(call, &ftrace_events, list) { - - ret = event_create_dir(call, d_events, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - if (ret < 0) - event_remove(call); - } + ret = event_trace_add_tracer(d_tracer, tr); + if (ret) + return ret; ret = register_module_notifier(&trace_module_nb); if (ret) @@ -1627,13 +1908,20 @@ static __init void event_test_stuff(void) */ static __init void event_trace_self_tests(void) { + struct ftrace_subsystem_dir *dir; + struct ftrace_event_file *file; struct ftrace_event_call *call; struct event_subsystem *system; + struct trace_array *tr; int ret; + tr = top_trace_array(); + pr_info("Running tests on trace events:\n"); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) @@ -1657,15 +1945,15 @@ static __init void event_trace_self_tests(void) * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (call->flags & TRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } - ftrace_event_enable_disable(call, 1); + ftrace_event_enable_disable(file, 1); event_test_stuff(); - ftrace_event_enable_disable(call, 0); + ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } @@ -1674,7 +1962,9 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on trace event systems:\n"); - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + + system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) @@ -1682,7 +1972,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1691,7 +1981,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); @@ -1706,7 +1996,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling all events\n"); return; @@ -1715,7 +2005,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e5b0ca8b8d4d..2a22a177ab44 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1907,16 +1907,17 @@ out_unlock: return err; } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { + struct event_subsystem *system = dir->subsystem; struct event_filter *filter; int err = 0; mutex_lock(&event_mutex); /* Make sure the system still has events */ - if (!system->nr_events) { + if (!dir->nr_events) { err = -ENODEV; goto out_unlock; } -- cgit v1.2.3 From ae3b5093ad6004b52e2825f3db1ad8200a2724d8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Wed, 23 Jan 2013 15:22:59 -0500 Subject: tracing: Use RING_BUFFER_ALL_CPUS for TRACE_PIPE_ALL_CPU Both RING_BUFFER_ALL_CPUS and TRACE_PIPE_ALL_CPU are defined as -1 and used to say that all the ring buffers are to be modified or read (instead of just a single cpu, which would be >= 0). There's no reason to keep TRACE_PIPE_ALL_CPU as it is also started to be used for more than what it was created for, and now that the ring buffer code added a generic RING_BUFFER_ALL_CPUS define, we can clean up the trace code to use that instead and remove the TRACE_PIPE_ALL_CPU macro. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 28 ++++++++++++++-------------- kernel/trace/trace.h | 2 -- kernel/trace/trace_kdb.c | 4 ++-- 3 files changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 932931897b8d..59953aa28845 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -287,13 +287,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. */ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ - /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. */ @@ -303,7 +303,7 @@ static inline void trace_access_lock(int cpu) static inline void trace_access_unlock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -1823,7 +1823,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ - if (cpu_file > TRACE_PIPE_ALL_CPU) { + if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1983,7 +1983,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else @@ -2291,7 +2291,7 @@ int trace_empty(struct trace_iterator *iter) int cpu; /* If we are looking at one CPU buffer, only check that one */ - if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { cpu = iter->cpu_file; buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { @@ -2533,7 +2533,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter->snapshot) tracing_stop(); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = ring_buffer_read_prepare(iter->tr->buffer, cpu); @@ -2617,7 +2617,7 @@ static int tracing_open(struct inode *inode, struct file *file) (file->f_flags & O_TRUNC)) { long cpu = (long) inode->i_private; - if (cpu == TRACE_PIPE_ALL_CPU) + if (cpu == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&global_trace); else tracing_reset(&global_trace, cpu); @@ -5035,7 +5035,7 @@ static __init int tracer_init_debugfs(void) NULL, &tracing_cpumask_fops); trace_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + (void *) RING_BUFFER_ALL_CPUS, &tracing_fops); trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); @@ -5055,7 +5055,7 @@ static __init int tracer_init_debugfs(void) NULL, &tracing_readme_fops); trace_create_file("trace_pipe", 0444, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); + (void *) RING_BUFFER_ALL_CPUS, &tracing_pipe_fops); trace_create_file("buffer_size_kb", 0644, d_tracer, (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); @@ -5085,7 +5085,7 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); + (void *) RING_BUFFER_ALL_CPUS, &snapshot_fops); #endif create_trace_options_dir(); @@ -5162,7 +5162,7 @@ void trace_init_global_iter(struct trace_iterator *iter) { iter->tr = &global_trace; iter->trace = current_trace; - iter->cpu_file = TRACE_PIPE_ALL_CPU; + iter->cpu_file = RING_BUFFER_ALL_CPUS; } static void @@ -5210,7 +5210,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) switch (oops_dump_mode) { case DUMP_ALL: - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; break; case DUMP_ORIG: iter.cpu_file = raw_smp_processor_id(); @@ -5219,7 +5219,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) goto out_enable; default: printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; } printk(KERN_TRACE "Dumping ftrace buffer:\n"); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 037f7eb03d69..da09a037abcd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -453,8 +453,6 @@ static __always_inline void trace_clear_recursion(int bit) current->trace_recursion = val; } -#define TRACE_PIPE_ALL_CPU -1 - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b3..cc1dbdc5ee5d 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -43,7 +43,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = ring_buffer_read_prepare(iter.tr->buffer, cpu); @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv) !cpu_online(cpu_file)) return KDB_BADINT; } else { - cpu_file = TRACE_PIPE_ALL_CPU; + cpu_file = RING_BUFFER_ALL_CPUS; } kdb_trap_printk++; -- cgit v1.2.3 From 2b6080f28c7cc3efc8625ab71495aae89aeb63a0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Fri, 11 May 2012 13:29:49 -0400 Subject: tracing: Encapsulate global_trace and remove dependencies on global vars The global_trace variable in kernel/trace/trace.c has been kept 'static' and local to that file so that it would not be used too much outside of that file. This has paid off, even though there were lots of changes to make the trace_array structure more generic (not depending on global_trace). Removal of a lot of direct usages of global_trace is needed to be able to create more trace_arrays such that we can add multiple buffers. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 561 ++++++++++++++++++++++---------------- kernel/trace/trace.h | 21 +- kernel/trace/trace_irqsoff.c | 8 +- kernel/trace/trace_sched_wakeup.c | 8 +- 4 files changed, 358 insertions(+), 240 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 59953aa28845..91fe40905828 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@ /* * ring buffer based function tracer * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> * * Originally taken from the RT patch by: @@ -251,9 +251,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly = &nop_trace; - /* * trace_types_lock is used to protect the trace_types list. */ @@ -350,9 +347,6 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); - /** * trace_wake_up - wake up tasks waiting for trace input * @@ -708,14 +702,14 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct ring_buffer *buf; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->allocated_snapshot) { + if (!tr->current_trace->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(current_trace != &nop_trace); + WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } @@ -742,11 +736,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) + if (WARN_ON_ONCE(!tr->current_trace->allocated_snapshot)) return; arch_spin_lock(&ftrace_max_lock); @@ -853,8 +847,8 @@ int register_tracer(struct tracer *type) #ifdef CONFIG_FTRACE_STARTUP_TEST if (type->selftest && !tracing_selftest_disabled) { - struct tracer *saved_tracer = current_trace; struct trace_array *tr = &global_trace; + struct tracer *saved_tracer = tr->current_trace; /* * Run a selftest on this tracer. @@ -865,7 +859,7 @@ int register_tracer(struct tracer *type) */ tracing_reset_online_cpus(tr); - current_trace = type; + tr->current_trace = type; if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ @@ -879,7 +873,7 @@ int register_tracer(struct tracer *type) pr_info("Testing tracer %s: ", type->name); ret = type->selftest(type, tr); /* the test is responsible for resetting too */ - current_trace = saved_tracer; + tr->current_trace = saved_tracer; if (ret) { printk(KERN_CONT "FAILED!\n"); /* Add the warning after printing 'FAILED' */ @@ -997,7 +991,7 @@ static void trace_init_cmdlines(void) int is_tracing_stopped(void) { - return trace_stop_count; + return global_trace.stop_count; } /** @@ -1029,12 +1023,12 @@ void tracing_start(void) if (tracing_disabled) return; - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) { - if (trace_stop_count < 0) { + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (--global_trace.stop_count) { + if (global_trace.stop_count < 0) { /* Someone screwed up their debugging */ WARN_ON_ONCE(1); - trace_stop_count = 0; + global_trace.stop_count = 0; } goto out; } @@ -1054,7 +1048,38 @@ void tracing_start(void) ftrace_start(); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + /* If global, we need to also start the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_start(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + + if (--tr->stop_count) { + if (tr->stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + tr->stop_count = 0; + } + goto out; + } + + buffer = tr->buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -1069,8 +1094,8 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (trace_stop_count++) + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (global_trace.stop_count++) goto out; /* Prevent the buffers from switching */ @@ -1087,7 +1112,28 @@ void tracing_stop(void) arch_spin_unlock(&ftrace_max_lock); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + /* If global, we need to also stop the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_stop(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) + goto out; + + buffer = tr->buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -1956,6 +2002,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; @@ -1968,8 +2015,8 @@ static void *s_start(struct seq_file *m, loff_t *pos) * will point to the same string as current_trace->name. */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); if (iter->snapshot && iter->trace->use_max_tr) @@ -2099,7 +2146,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_array *tr = iter->tr; struct trace_array_cpu *data = tr->data[tr->cpu]; - struct tracer *type = current_trace; + struct tracer *type = iter->trace; unsigned long entries; unsigned long total; const char *name = "preemption"; @@ -2478,7 +2525,8 @@ static const struct seq_operations tracer_seq_ops = { static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { - long cpu_file = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int cpu; @@ -2503,19 +2551,20 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter->trace) goto fail; - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace->print_max || snapshot) + /* Currently only the top directory has a snapshot */ + if (tr->current_trace->print_max || snapshot) iter->tr = &max_tr; else - iter->tr = &global_trace; + iter->tr = tr; iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); - iter->cpu_file = cpu_file; + iter->cpu_file = tc->cpu; /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) @@ -2531,7 +2580,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) /* stop the trace while dumping if we are not opening "snapshot" */ if (!iter->snapshot) - tracing_stop(); + tracing_stop_tr(tr); if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { @@ -2578,6 +2627,7 @@ static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct trace_iterator *iter; + struct trace_array *tr; int cpu; if (!(file->f_mode & FMODE_READ)) @@ -2585,6 +2635,12 @@ static int tracing_release(struct inode *inode, struct file *file) iter = m->private; + /* Only the global tracer has a matching max_tr */ + if (iter->tr == &max_tr) + tr = &global_trace; + else + tr = iter->tr; + mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) @@ -2596,7 +2652,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (!iter->snapshot) /* reenable tracing if it was previously enabled */ - tracing_start(); + tracing_start_tr(tr); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2615,12 +2671,13 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - long cpu = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; - if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&global_trace); + if (tc->cpu == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(tr); else - tracing_reset(&global_trace, cpu); + tracing_reset(tr, tc->cpu); } if (file->f_mode & FMODE_READ) { @@ -2767,8 +2824,9 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - int err, cpu; + struct trace_array *tr = filp->private_data; cpumask_var_t tracing_cpumask_new; + int err, cpu; if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -2788,13 +2846,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, */ if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&global_trace.data[cpu]->disabled); - ring_buffer_record_disable_cpu(global_trace.buffer, cpu); + atomic_inc(&tr->data[cpu]->disabled); + ring_buffer_record_disable_cpu(tr->buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&global_trace.data[cpu]->disabled); - ring_buffer_record_enable_cpu(global_trace.buffer, cpu); + atomic_dec(&tr->data[cpu]->disabled); + ring_buffer_record_enable_cpu(tr->buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); @@ -2823,12 +2881,13 @@ static const struct file_operations tracing_cpumask_fops = { static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; + struct trace_array *tr = m->private; u32 tracer_flags; int i; mutex_lock(&trace_types_lock); - tracer_flags = current_trace->flags->val; - trace_opts = current_trace->flags->opts; + tracer_flags = tr->current_trace->flags->val; + trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) @@ -2892,15 +2951,15 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) return 0; } -int set_tracer_flag(unsigned int mask, int enabled) +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) return 0; /* Give the tracer a chance to approve the change */ - if (current_trace->flag_changed) - if (current_trace->flag_changed(current_trace, mask, !!enabled)) + if (tr->current_trace->flag_changed) + if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) return -EINVAL; if (enabled) @@ -2924,7 +2983,7 @@ int set_tracer_flag(unsigned int mask, int enabled) return 0; } -static int trace_set_options(char *option) +static int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; @@ -2942,14 +3001,14 @@ static int trace_set_options(char *option) for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - ret = set_tracer_flag(1 << i, !neg); + ret = set_tracer_flag(tr, 1 << i, !neg); break; } } /* If no option could be set, test the specific tracer options */ if (!trace_options[i]) - ret = set_tracer_option(current_trace, cmp, neg); + ret = set_tracer_option(tr->current_trace, cmp, neg); mutex_unlock(&trace_types_lock); @@ -2960,6 +3019,8 @@ static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; char buf[64]; int ret; @@ -2971,7 +3032,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - ret = trace_set_options(buf); + ret = trace_set_options(tr, buf); if (ret < 0) return ret; @@ -2984,7 +3045,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) { if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_trace_options_show, NULL); + + return single_open(file, tracing_trace_options_show, inode->i_private); } static const struct file_operations tracing_iter_fops = { @@ -3082,11 +3144,12 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", current_trace->name); + r = sprintf(buf, "%s\n", tr->current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3130,7 +3193,8 @@ static int resize_buffer_duplicate_size(struct trace_array *tr, return ret; } -static int __tracing_resize_ring_buffer(unsigned long size, int cpu) +static int __tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu) { int ret; @@ -3142,20 +3206,20 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) ring_buffer_expanded = 1; /* May be called before buffers are initialized */ - if (!global_trace.buffer) + if (!tr->buffer) return 0; - ret = ring_buffer_resize(global_trace.buffer, size, cpu); + ret = ring_buffer_resize(tr->buffer, size, cpu); if (ret < 0) return ret; - if (!current_trace->use_max_tr) + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || + !tr->current_trace->use_max_tr) goto out; ret = ring_buffer_resize(max_tr.buffer, size, cpu); if (ret < 0) { - int r = resize_buffer_duplicate_size(&global_trace, - &global_trace, cpu); + int r = resize_buffer_duplicate_size(tr, tr, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3184,14 +3248,15 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) out: if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&global_trace, size); + set_buffer_entries(tr, size); else - global_trace.data[cpu]->entries = size; + tr->data[cpu]->entries = size; return ret; } -static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) { int ret = size; @@ -3205,7 +3270,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) } } - ret = __tracing_resize_ring_buffer(size, cpu_id); + ret = __tracing_resize_ring_buffer(tr, size, cpu_id); if (ret < 0) ret = -ENOMEM; @@ -3232,7 +3297,7 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(trace_buf_size, + ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); @@ -3242,7 +3307,7 @@ int tracing_update_buffers(void) struct trace_option_dentry; static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); @@ -3258,7 +3323,7 @@ static int tracing_set_tracer(const char *buf) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = __tracing_resize_ring_buffer(trace_buf_size, + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; @@ -3273,18 +3338,18 @@ static int tracing_set_tracer(const char *buf) ret = -EINVAL; goto out; } - if (t == current_trace) + if (t == tr->current_trace) goto out; trace_branch_disable(); - current_trace->enabled = false; + tr->current_trace->enabled = false; - if (current_trace->reset) - current_trace->reset(tr); + if (tr->current_trace->reset) + tr->current_trace->reset(tr); - had_max_tr = current_trace->allocated_snapshot; - current_trace = &nop_trace; + had_max_tr = tr->current_trace->allocated_snapshot; + tr->current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { /* @@ -3303,11 +3368,11 @@ static int tracing_set_tracer(const char *buf) ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&max_tr, 1); tracing_reset_online_cpus(&max_tr); - current_trace->allocated_snapshot = false; + tr->current_trace->allocated_snapshot = false; } destroy_trace_option_files(topts); - topts = create_trace_option_files(t); + topts = create_trace_option_files(tr, t); if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ ret = resize_buffer_duplicate_size(&max_tr, &global_trace, @@ -3323,8 +3388,8 @@ static int tracing_set_tracer(const char *buf) goto out; } - current_trace = t; - current_trace->enabled = true; + tr->current_trace = t; + tr->current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -3398,7 +3463,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, static int tracing_open_pipe(struct inode *inode, struct file *filp) { - long cpu_file = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int ret = 0; @@ -3423,7 +3489,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3440,8 +3506,8 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (trace_clocks[trace_clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - iter->cpu_file = cpu_file; - iter->tr = &global_trace; + iter->cpu_file = tc->cpu; + iter->tr = tc->tr; mutex_init(&iter->mutex); filp->private_data = iter; @@ -3563,6 +3629,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; + struct trace_array *tr = iter->tr; ssize_t sret; /* return any leftover data */ @@ -3574,8 +3641,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); /* @@ -3731,6 +3798,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; + struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -3740,8 +3808,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); @@ -3803,43 +3871,19 @@ out_err: goto out; } -struct ftrace_entries_info { - struct trace_array *tr; - int cpu; -}; - -static int tracing_entries_open(struct inode *inode, struct file *filp) -{ - struct ftrace_entries_info *info; - - if (tracing_disabled) - return -ENODEV; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - info->tr = &global_trace; - info->cpu = (unsigned long)inode->i_private; - - filp->private_data = info; - - return 0; -} - static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_entries_info *info = filp->private_data; - struct trace_array *tr = info->tr; + struct trace_cpu *tc = filp->private_data; + struct trace_array *tr = tc->tr; char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); - if (info->cpu == RING_BUFFER_ALL_CPUS) { + if (tc->cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; @@ -3866,7 +3910,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); + r = sprintf(buf, "%lu\n", tr->data[tc->cpu]->entries >> 10); mutex_unlock(&trace_types_lock); @@ -3878,7 +3922,7 @@ static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_entries_info *info = filp->private_data; + struct trace_cpu *tc = filp->private_data; unsigned long val; int ret; @@ -3893,7 +3937,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - ret = tracing_resize_ring_buffer(val, info->cpu); + ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); if (ret < 0) return ret; @@ -3902,16 +3946,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } -static int -tracing_entries_release(struct inode *inode, struct file *filp) -{ - struct ftrace_entries_info *info = filp->private_data; - - kfree(info); - - return 0; -} - static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3953,11 +3987,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { + struct trace_array *tr = inode->i_private; + /* disable tracing ? */ if (trace_flags & TRACE_ITER_STOP_ON_FREE) tracing_off(); /* resize the ring buffer to 0 */ - tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); + tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); return 0; } @@ -4068,13 +4104,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, static int tracing_clock_show(struct seq_file *m, void *v) { + struct trace_array *tr = m->private; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) seq_printf(m, "%s%s%s%s", i ? " " : "", - i == trace_clock_id ? "[" : "", trace_clocks[i].name, - i == trace_clock_id ? "]" : ""); + i == tr->clock_id ? "[" : "", trace_clocks[i].name, + i == tr->clock_id ? "]" : ""); seq_putc(m, '\n'); return 0; @@ -4083,6 +4120,8 @@ static int tracing_clock_show(struct seq_file *m, void *v) static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; char buf[64]; const char *clockstr; int i; @@ -4104,12 +4143,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - trace_clock_id = i; - mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); - if (max_tr.buffer) + tr->clock_id = i; + + ring_buffer_set_clock(tr->buffer, trace_clocks[i].func); + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && max_tr.buffer) ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); /* @@ -4130,20 +4169,37 @@ static int tracing_clock_open(struct inode *inode, struct file *file) { if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_clock_show, NULL); + + return single_open(file, tracing_clock_show, inode->i_private); } #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { + struct trace_cpu *tc = inode->i_private; struct trace_iterator *iter; + struct seq_file *m; int ret = 0; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + return -ENOMEM; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + kfree(m); + return -ENOMEM; + } + iter->tr = tc->tr; + m->private = iter; + file->private_data = m; } + return ret; } @@ -4151,6 +4207,9 @@ static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; unsigned long val; int ret; @@ -4164,30 +4223,30 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&trace_types_lock); - if (current_trace->use_max_tr) { + if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto out; } switch (val) { case 0: - if (current_trace->allocated_snapshot) { + if (tr->current_trace->allocated_snapshot) { /* free spare buffer */ ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&max_tr, 1); tracing_reset_online_cpus(&max_tr); - current_trace->allocated_snapshot = false; + tr->current_trace->allocated_snapshot = false; } break; case 1: - if (!current_trace->allocated_snapshot) { + if (!tr->current_trace->allocated_snapshot) { /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&max_tr, &global_trace, RING_BUFFER_ALL_CPUS); if (ret < 0) break; - current_trace->allocated_snapshot = true; + tr->current_trace->allocated_snapshot = true; } local_irq_disable(); @@ -4196,7 +4255,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, local_irq_enable(); break; default: - if (current_trace->allocated_snapshot) + if (tr->current_trace->allocated_snapshot) tracing_reset_online_cpus(&max_tr); break; } @@ -4209,6 +4268,22 @@ out: mutex_unlock(&trace_types_lock); return ret; } + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + if (file->f_mode & FMODE_READ) + return tracing_release(inode, file); + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4236,10 +4311,9 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_entries_open, + .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, - .release = tracing_entries_release, .llseek = generic_file_llseek, }; @@ -4274,7 +4348,7 @@ static const struct file_operations snapshot_fops = { .read = seq_read, .write = tracing_snapshot_write, .llseek = tracing_seek, - .release = tracing_release, + .release = tracing_snapshot_release, }; #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4287,7 +4361,8 @@ struct ftrace_buffer_info { static int tracing_buffers_open(struct inode *inode, struct file *filp) { - int cpu = (int)(long)inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct ftrace_buffer_info *info; if (tracing_disabled) @@ -4297,8 +4372,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (!info) return -ENOMEM; - info->tr = &global_trace; - info->cpu = cpu; + info->tr = tr; + info->cpu = tc->cpu; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; @@ -4535,12 +4610,13 @@ static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - unsigned long cpu = (unsigned long)filp->private_data; - struct trace_array *tr = &global_trace; + struct trace_cpu *tc = filp->private_data; + struct trace_array *tr = tc->tr; struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; + int cpu = tc->cpu; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4636,58 +4712,57 @@ static const struct file_operations tracing_dyn_info_fops = { }; #endif -static struct dentry *d_tracer; - -struct dentry *tracing_init_dentry(void) +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) { static int once; - if (d_tracer) - return d_tracer; + if (tr->dir) + return tr->dir; if (!debugfs_initialized()) return NULL; - d_tracer = debugfs_create_dir("tracing", NULL); + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + tr->dir = debugfs_create_dir("tracing", NULL); - if (!d_tracer && !once) { + if (!tr->dir && !once) { once = 1; pr_warning("Could not create debugfs directory 'tracing'\n"); return NULL; } - return d_tracer; + return tr->dir; } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ + return tracing_init_dentry_tr(&global_trace); +} -static struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { - static int once; struct dentry *d_tracer; - if (d_percpu) - return d_percpu; - - d_tracer = tracing_init_dentry(); + if (tr->percpu_dir) + return tr->percpu_dir; + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - d_percpu = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); - if (!d_percpu && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'per_cpu'\n"); - return NULL; - } + WARN_ONCE(!tr->percpu_dir, + "Could not create debugfs directory 'per_cpu/%d'\n", cpu); - return d_percpu; + return tr->percpu_dir; } -static void tracing_init_debugfs_percpu(long cpu) +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct dentry *d_percpu = tracing_dentry_percpu(); + struct trace_array_cpu *data = tr->data[cpu]; + struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -4703,20 +4778,20 @@ static void tracing_init_debugfs_percpu(long cpu) /* per cpu trace_pipe */ trace_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); + (void *)&data->trace_cpu, &tracing_pipe_fops); /* per cpu trace */ trace_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); + (void *)&data->trace_cpu, &tracing_fops); trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); + (void *)&data->trace_cpu, &tracing_buffers_fops); trace_create_file("stats", 0444, d_cpu, - (void *) cpu, &tracing_stats_fops); + (void *)&data->trace_cpu, &tracing_stats_fops); trace_create_file("buffer_size_kb", 0444, d_cpu, - (void *) cpu, &tracing_entries_fops); + (void *)&data->trace_cpu, &tracing_entries_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -4727,6 +4802,7 @@ static void tracing_init_debugfs_percpu(long cpu) struct trace_option_dentry { struct tracer_opt *opt; struct tracer_flags *flags; + struct trace_array *tr; struct dentry *entry; }; @@ -4762,7 +4838,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = __set_tracer_option(current_trace, topt->flags, + ret = __set_tracer_option(topt->tr->current_trace, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) @@ -4801,6 +4877,7 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = &global_trace; long index = (long)filp->private_data; unsigned long val; int ret; @@ -4813,7 +4890,7 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; mutex_lock(&trace_types_lock); - ret = set_tracer_flag(1 << index, val); + ret = set_tracer_flag(tr, 1 << index, val); mutex_unlock(&trace_types_lock); if (ret < 0) @@ -4847,40 +4924,41 @@ struct dentry *trace_create_file(const char *name, } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr) { struct dentry *d_tracer; - static struct dentry *t_options; - if (t_options) - return t_options; + if (tr->options) + return tr->options; - d_tracer = tracing_init_dentry(); + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - t_options = debugfs_create_dir("options", d_tracer); - if (!t_options) { + tr->options = debugfs_create_dir("options", d_tracer); + if (!tr->options) { pr_warning("Could not create debugfs directory 'options'\n"); return NULL; } - return t_options; + return tr->options; } static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, + struct trace_option_dentry *topt, struct tracer_flags *flags, struct tracer_opt *opt) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; topt->flags = flags; topt->opt = opt; + topt->tr = tr; topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); @@ -4888,7 +4966,7 @@ create_trace_option_file(struct trace_option_dentry *topt, } static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; struct tracer_flags *flags; @@ -4913,7 +4991,7 @@ create_trace_option_files(struct tracer *tracer) return NULL; for (cnt = 0; opts[cnt].name; cnt++) - create_trace_option_file(&topts[cnt], flags, + create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); return topts; @@ -4936,11 +5014,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts) } static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, + const char *option, long index) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return NULL; @@ -4948,17 +5027,17 @@ create_trace_option_core_file(const char *option, long index) &trace_options_core_fops); } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; int i; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(trace_options[i], i); + create_trace_option_core_file(tr, trace_options[i], i); } static ssize_t @@ -4997,12 +5076,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (val) { ring_buffer_record_on(buffer); - if (current_trace->start) - current_trace->start(tr); + if (tr->current_trace->start) + tr->current_trace->start(tr); } else { ring_buffer_record_off(buffer); - if (current_trace->stop) - current_trace->stop(tr); + if (tr->current_trace->stop) + tr->current_trace->stop(tr); } mutex_unlock(&trace_types_lock); } @@ -5019,6 +5098,38 @@ static const struct file_operations rb_simple_fops = { .llseek = default_llseek, }; +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ + + trace_create_file("trace_options", 0644, d_tracer, + tr, &tracing_iter_fops); + + trace_create_file("trace", 0644, d_tracer, + (void *)&tr->trace_cpu, &tracing_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, + (void *)&tr->trace_cpu, &tracing_pipe_fops); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + (void *)&tr->trace_cpu, &tracing_entries_fops); + + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + tr, &tracing_total_entries_fops); + + trace_create_file("free_buffer", 0644, d_tracer, + tr, &tracing_free_buffer_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + tr, &tracing_mark_fops); + + trace_create_file("trace_clock", 0644, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("tracing_on", 0644, d_tracer, + tr, &rb_simple_fops); +} + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -5028,14 +5139,10 @@ static __init int tracer_init_debugfs(void) d_tracer = tracing_init_dentry(); - trace_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); + init_tracer_debugfs(&global_trace, d_tracer); trace_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - - trace_create_file("trace", 0644, d_tracer, - (void *) RING_BUFFER_ALL_CPUS, &tracing_fops); + &global_trace, &tracing_cpumask_fops); trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); @@ -5054,30 +5161,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); - trace_create_file("trace_pipe", 0444, d_tracer, - (void *) RING_BUFFER_ALL_CPUS, &tracing_pipe_fops); - - trace_create_file("buffer_size_kb", 0644, d_tracer, - (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); - - trace_create_file("buffer_total_size_kb", 0444, d_tracer, - &global_trace, &tracing_total_entries_fops); - - trace_create_file("free_buffer", 0644, d_tracer, - &global_trace, &tracing_free_buffer_fops); - - trace_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("trace_clock", 0644, d_tracer, NULL, - &trace_clock_fops); - - trace_create_file("tracing_on", 0644, d_tracer, - &global_trace, &rb_simple_fops); - #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -5085,13 +5171,13 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_tracer, - (void *) RING_BUFFER_ALL_CPUS, &snapshot_fops); + (void *)&global_trace.trace_cpu, &snapshot_fops); #endif - create_trace_options_dir(); + create_trace_options_dir(&global_trace); for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(cpu); + tracing_init_debugfs_percpu(&global_trace, cpu); return 0; } @@ -5161,7 +5247,7 @@ trace_printk_seq(struct trace_seq *s) void trace_init_global_iter(struct trace_iterator *iter) { iter->tr = &global_trace; - iter->trace = current_trace; + iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; } @@ -5315,6 +5401,8 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); + raw_spin_lock_init(&global_trace.start_lock); + /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); if (!global_trace.buffer) { @@ -5328,6 +5416,7 @@ __init static int tracer_alloc_buffers(void) #ifdef CONFIG_TRACER_MAX_TRACE max_tr.buffer = ring_buffer_alloc(1, rb_flags); + raw_spin_lock_init(&max_tr.start_lock); if (!max_tr.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); @@ -5339,7 +5428,11 @@ __init static int tracer_alloc_buffers(void) /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { global_trace.data[i] = &per_cpu(global_trace_cpu, i); + global_trace.data[i]->trace_cpu.cpu = i; + global_trace.data[i]->trace_cpu.tr = &global_trace; max_tr.data[i] = &per_cpu(max_tr_data, i); + max_tr.data[i]->trace_cpu.cpu = i; + max_tr.data[i]->trace_cpu.tr = &max_tr; } set_buffer_entries(&global_trace, @@ -5353,6 +5446,8 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); + global_trace.current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -5363,6 +5458,10 @@ __init static int tracer_alloc_buffers(void) global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + /* Holder for file callbacks */ + global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; + global_trace.trace_cpu.tr = &global_trace; + INIT_LIST_HEAD(&global_trace.systems); INIT_LIST_HEAD(&global_trace.events); list_add(&global_trace.list, &ftrace_trace_arrays); @@ -5371,7 +5470,7 @@ __init static int tracer_alloc_buffers(void) char *option; option = strsep(&trace_boot_options, ","); - trace_set_options(option); + trace_set_options(&global_trace, option); } return 0; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index da09a037abcd..b80fbcf70af4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -127,12 +127,21 @@ enum trace_flag_type { #define TRACE_BUF_SIZE 1024 +struct trace_array; + +struct trace_cpu { + struct trace_array *tr; + struct dentry *dir; + int cpu; +}; + /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started * the trace, etc.) */ struct trace_array_cpu { + struct trace_cpu trace_cpu; atomic_t disabled; void *buffer_page; /* ring buffer spare */ @@ -151,6 +160,8 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; +struct tracer; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -161,9 +172,16 @@ struct trace_array { struct list_head list; int cpu; int buffer_disabled; + struct trace_cpu trace_cpu; /* place holder */ + int stop_count; + int clock_id; + struct tracer *current_trace; unsigned int flags; cycle_t time_start; + raw_spinlock_t start_lock; struct dentry *dir; + struct dentry *options; + struct dentry *percpu_dir; struct dentry *event_dir; struct list_head systems; struct list_head events; @@ -474,6 +492,7 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr); struct dentry *tracing_init_dentry(void); struct ring_buffer_event; @@ -979,7 +998,7 @@ extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); -int set_tracer_flag(unsigned int mask, int enabled); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 443b25b43b4f..b3cf6bf308ef 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -561,8 +561,8 @@ static void __irqsoff_tracer_init(struct trace_array *tr) save_flags = trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; irqsoff_trace = tr; @@ -581,8 +581,8 @@ static void irqsoff_tracer_reset(struct trace_array *tr) stop_irqsoff_tracer(tr, is_graph()); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); } static void irqsoff_tracer_start(struct trace_array *tr) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index fde652c9a511..5255a8477247 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -543,8 +543,8 @@ static int __wakeup_tracer_init(struct trace_array *tr) save_flags = trace_flags; /* non overwrite screws up the latency tracers */ - set_tracer_flag(TRACE_ITER_OVERWRITE, 1); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; wakeup_trace = tr; @@ -573,8 +573,8 @@ static void wakeup_tracer_reset(struct trace_array *tr) /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag); - set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); } static void wakeup_tracer_start(struct trace_array *tr) -- cgit v1.2.3 From ccb469a198cffac94a7eea0b69f715f06e2ddf15 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Thu, 2 Aug 2012 10:32:10 -0400 Subject: tracing: Pass the ftrace_file to the buffer lock reserve code Pass the struct ftrace_event_file *ftrace_file to the trace_event_buffer_lock_reserve() (new function that replaces the trace_current_buffer_lock_reserver()). The ftrace_file holds a pointer to the trace_array that is in use. In the case of multiple buffers with different trace_arrays, this allows different events to be recorded into different buffers. Also fixed some of the stale comments in include/trace/ftrace.h Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace_event.h | 7 +++++++ include/trace/ftrace.h | 9 +++++---- kernel/trace/trace.c | 12 ++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c7191d482f98..fd28c170c597 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -128,6 +128,13 @@ enum print_line_t { void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); +struct ftrace_event_file; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 191d9661e277..e5d140a91fd7 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -414,7 +414,8 @@ static inline notrace int ftrace_get_offsets_##call( \ * * static void ftrace_raw_event_<call>(void *__data, proto) * { - * struct ftrace_event_call *event_call = __data; + * struct ftrace_event_file *ftrace_file = __data; + * struct ftrace_event_call *event_call = ftrace_file->event_call; * struct ftrace_data_offsets_<call> __maybe_unused __data_offsets; * struct ring_buffer_event *event; * struct ftrace_raw_<call> *entry; <-- defined in stage 1 @@ -428,7 +429,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * * __data_size = ftrace_get_offsets_<call>(&__data_offsets, args); * - * event = trace_current_buffer_lock_reserve(&buffer, + * event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, * event_<call>->event.type, * sizeof(*entry) + __data_size, * irq_flags, pc); @@ -440,7 +441,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * __array macros. * * if (!filter_current_check_discard(buffer, event_call, entry, event)) - * trace_current_buffer_unlock_commit(buffer, + * trace_nowake_buffer_unlock_commit(buffer, * event, irq_flags, pc); * } * @@ -533,7 +534,7 @@ ftrace_raw_event_##call(void *__data, proto) \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ \ - event = trace_current_buffer_lock_reserve(&buffer, \ + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, \ event_call->event.type, \ sizeof(*entry) + __data_size, \ irq_flags, pc); \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 91fe40905828..29bff72f97ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1293,6 +1293,18 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc) +{ + *current_rb = ftrace_file->tr->buffer; + return trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, -- cgit v1.2.3 From a7603ff4b5f7e26e67af82a4c3d05eeeb8d7b160 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Mon, 6 Aug 2012 16:24:11 -0400 Subject: tracing: Replace the static global per_cpu arrays with allocated per_cpu The global and max-tr currently use static per_cpu arrays for the CPU data descriptors. But in order to get new allocated trace_arrays, they need to be allocated per_cpu arrays. Instead of using the static arrays, switch the global and max-tr to use allocated data. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 92 +++++++++++++++++++++--------------- kernel/trace/trace.h | 2 +- kernel/trace/trace_branch.c | 6 ++- kernel/trace/trace_functions.c | 4 +- kernel/trace/trace_functions_graph.c | 4 +- kernel/trace/trace_irqsoff.c | 6 +-- kernel/trace/trace_kdb.c | 4 +- kernel/trace/trace_mmiotrace.c | 4 +- kernel/trace/trace_sched_switch.c | 4 +- kernel/trace/trace_sched_wakeup.c | 14 +++--- 10 files changed, 79 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 29bff72f97ef..406adbc277a0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -191,8 +191,6 @@ static struct trace_array global_trace; LIST_HEAD(ftrace_trace_arrays); -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); - int filter_current_check_discard(struct ring_buffer *buffer, struct ftrace_event_call *call, void *rec, struct ring_buffer_event *event) @@ -227,8 +225,6 @@ cycle_t ftrace_now(int cpu) */ static struct trace_array max_tr; -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); - int tracing_is_enabled(void) { return tracing_is_on(); @@ -666,13 +662,13 @@ unsigned long __read_mostly tracing_max_latency; static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = tr->data[cpu]; + struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu); struct trace_array_cpu *max_data; max_tr.cpu = cpu; max_tr.time_start = data->preempt_timestamp; - max_data = max_tr.data[cpu]; + max_data = per_cpu_ptr(max_tr.data, cpu); max_data->saved_latency = tracing_max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -1984,7 +1980,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) unsigned long entries = 0; u64 ts; - tr->data[cpu]->skipped_entries = 0; + per_cpu_ptr(tr->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) @@ -2004,7 +2000,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) ring_buffer_read(buf_iter, NULL); } - tr->data[cpu]->skipped_entries = entries; + per_cpu_ptr(tr->data, cpu)->skipped_entries = entries; } /* @@ -2099,8 +2095,8 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e * entries for the trace and we need to ignore the * ones before the time stamp. */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; + if (per_cpu_ptr(tr->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(tr->data, cpu)->skipped_entries; /* total is the same as the entries */ *total += count; } else @@ -2157,7 +2153,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = tr->data[tr->cpu]; + struct trace_array_cpu *data = per_cpu_ptr(tr->data, tr->cpu); struct tracer *type = iter->trace; unsigned long entries; unsigned long total; @@ -2227,7 +2223,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; - if (iter->tr->data[iter->cpu]->skipped_entries) + if (per_cpu_ptr(iter->tr->data, iter->cpu)->skipped_entries) return; cpumask_set_cpu(iter->cpu, iter->started); @@ -2858,12 +2854,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, */ if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(tr->data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&tr->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(tr->data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->buffer, cpu); } } @@ -3177,7 +3173,7 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val) { int cpu; for_each_tracing_cpu(cpu) - tr->data[cpu]->entries = val; + per_cpu_ptr(tr->data, cpu)->entries = val; } /* resize @tr's buffer to the size of @size_tr's entries */ @@ -3189,17 +3185,18 @@ static int resize_buffer_duplicate_size(struct trace_array *tr, if (cpu_id == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { ret = ring_buffer_resize(tr->buffer, - size_tr->data[cpu]->entries, cpu); + per_cpu_ptr(size_tr->data, cpu)->entries, cpu); if (ret < 0) break; - tr->data[cpu]->entries = size_tr->data[cpu]->entries; + per_cpu_ptr(tr->data, cpu)->entries = + per_cpu_ptr(size_tr->data, cpu)->entries; } } else { ret = ring_buffer_resize(tr->buffer, - size_tr->data[cpu_id]->entries, cpu_id); + per_cpu_ptr(size_tr->data, cpu_id)->entries, cpu_id); if (ret == 0) - tr->data[cpu_id]->entries = - size_tr->data[cpu_id]->entries; + per_cpu_ptr(tr->data, cpu_id)->entries = + per_cpu_ptr(size_tr->data, cpu_id)->entries; } return ret; @@ -3256,13 +3253,13 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, if (cpu == RING_BUFFER_ALL_CPUS) set_buffer_entries(&max_tr, size); else - max_tr.data[cpu]->entries = size; + per_cpu_ptr(max_tr.data, cpu)->entries = size; out: if (cpu == RING_BUFFER_ALL_CPUS) set_buffer_entries(tr, size); else - tr->data[cpu]->entries = size; + per_cpu_ptr(tr->data, cpu)->entries = size; return ret; } @@ -3905,8 +3902,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf, for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) - size = tr->data[cpu]->entries; - if (size != tr->data[cpu]->entries) { + size = per_cpu_ptr(tr->data, cpu)->entries; + if (size != per_cpu_ptr(tr->data, cpu)->entries) { buf_size_same = 0; break; } @@ -3922,7 +3919,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", tr->data[tc->cpu]->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->data, tc->cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -3969,7 +3966,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += tr->data[cpu]->entries >> 10; + size += per_cpu_ptr(tr->data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -4773,7 +4770,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) static void tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct trace_array_cpu *data = tr->data[cpu]; + struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu); struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -5298,7 +5295,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5366,7 +5363,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled); } tracing_on(); } @@ -5422,11 +5419,31 @@ __init static int tracer_alloc_buffers(void) WARN_ON(1); goto out_free_cpumask; } + + global_trace.data = alloc_percpu(struct trace_array_cpu); + + if (!global_trace.data) { + printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); + WARN_ON(1); + goto out_free_cpumask; + } + + for_each_tracing_cpu(i) { + memset(per_cpu_ptr(global_trace.data, i), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(global_trace.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(global_trace.data, i)->trace_cpu.tr = &global_trace; + } + if (global_trace.buffer_disabled) tracing_off(); - #ifdef CONFIG_TRACER_MAX_TRACE + max_tr.data = alloc_percpu(struct trace_array_cpu); + if (!max_tr.data) { + printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); + WARN_ON(1); + goto out_free_cpumask; + } max_tr.buffer = ring_buffer_alloc(1, rb_flags); raw_spin_lock_init(&max_tr.start_lock); if (!max_tr.buffer) { @@ -5435,18 +5452,15 @@ __init static int tracer_alloc_buffers(void) ring_buffer_free(global_trace.buffer); goto out_free_cpumask; } -#endif - /* Allocate the first page for all buffers */ for_each_tracing_cpu(i) { - global_trace.data[i] = &per_cpu(global_trace_cpu, i); - global_trace.data[i]->trace_cpu.cpu = i; - global_trace.data[i]->trace_cpu.tr = &global_trace; - max_tr.data[i] = &per_cpu(max_tr_data, i); - max_tr.data[i]->trace_cpu.cpu = i; - max_tr.data[i]->trace_cpu.tr = &max_tr; + memset(per_cpu_ptr(max_tr.data, i), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(max_tr.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(max_tr.data, i)->trace_cpu.tr = &max_tr; } +#endif + /* Allocate the first page for all buffers */ set_buffer_entries(&global_trace, ring_buffer_size(global_trace.buffer, 0)); #ifdef CONFIG_TRACER_MAX_TRACE @@ -5488,6 +5502,8 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: + free_percpu(global_trace.data); + free_percpu(max_tr.data); free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b80fbcf70af4..15ccd7cd1560 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -186,7 +186,7 @@ struct trace_array { struct list_head systems; struct list_head events; struct task_struct *waiter; - struct trace_array_cpu *data[NR_CPUS]; + struct trace_array_cpu *data; }; enum { diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 95e96842ed29..6dadbefbb1d6 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; + struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; struct ring_buffer *buffer; @@ -51,7 +52,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) local_irq_save(flags); cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + data = per_cpu_ptr(tr->data, cpu); + if (atomic_inc_return(&data->disabled) != 1) goto out; pc = preempt_count(); @@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) __buffer_unlock_commit(buffer, event); out: - atomic_dec(&tr->data[cpu]->disabled); + atomic_dec(&data->disabled); local_irq_restore(flags); } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 601152523326..9d73861efc6a 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, goto out; cpu = smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, */ local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 39ada66389cc..ca986d61a282 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index b3cf6bf308ef..9b52f9cf7a0d 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -121,7 +121,7 @@ static int func_prolog_dec(struct trace_array *tr, if (!irqs_disabled_flags(*flags)) return 0; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) @@ -380,7 +380,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (per_cpu(tracing_cpu, cpu)) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->data, cpu); if (unlikely(!data) || atomic_read(&data->disabled)) return; @@ -418,7 +418,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) if (!tracer_enabled) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->data, cpu); if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index cc1dbdc5ee5d..349f6941e8f2 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled); } old_userobj = trace_flags; @@ -83,7 +83,7 @@ out: trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled); } for_each_tracing_cpu(cpu) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e5..2472f6f76b50 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = tr->data[smp_processor_id()]; + struct trace_array_cpu *data = per_cpu_ptr(tr->data, smp_processor_id()); __trace_mmiotrace_rw(tr, data, rw); } @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) struct trace_array_cpu *data; preempt_disable(); - data = tr->data[smp_processor_id()]; + data = per_cpu_ptr(tr->data, smp_processor_id()); __trace_mmiotrace_map(tr, data, map); preempt_enable(); } diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3374c792ccd8..1ffe39abd6fc 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5255a8477247..f9ceb75a95b7 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -89,7 +89,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (cpu != wakeup_current_cpu) goto out_enable; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; @@ -353,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -365,7 +365,7 @@ probe_wakeup_sched_switch(void *ignore, goto out_unlock; /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +387,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -435,7 +435,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) return; pc = preempt_count(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -458,7 +458,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) local_save_flags(flags); - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +472,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) -- cgit v1.2.3 From 12ab74ee00d154bc05ea2fc659b7ce6519e5d5a6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Wed, 8 Aug 2012 14:48:20 -0400 Subject: tracing: Make syscall events suitable for multiple buffers Currently the syscall events record into the global buffer. But if multiple buffers are in place, then we need to have syscall events record in the proper buffers. By adding descriptors to pass to the syscall event functions, the syscall events can now record into the buffers that have been assigned to them (one event may be applied to mulitple buffers). This will allow tracing high volume syscalls along with seldom occurring syscalls without losing the seldom syscall events. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.h | 11 ++++++ kernel/trace/trace_syscalls.c | 80 +++++++++++++++++++++++++------------------ 2 files changed, 57 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 15ccd7cd1560..68cad7a9e089 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,11 @@ #include <linux/trace_seq.h> #include <linux/ftrace_event.h> +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h> /* For NR_SYSCALLS */ +#include <asm/syscall.h> /* some archs define it here */ +#endif + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -173,6 +178,12 @@ struct trace_array { int cpu; int buffer_disabled; struct trace_cpu trace_cpu; /* place holder */ +#ifdef CONFIG_FTRACE_SYSCALLS + int sys_refcount_enter; + int sys_refcount_exit; + DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); + DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +#endif int stop_count; int clock_id; struct tracer *current_trace; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a809e321058..a842783ad6be 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -12,10 +12,6 @@ #include "trace.h" static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, enum trace_reg type, void *data); @@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { + struct trace_array *tr = data; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_enter_syscalls)) + if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, sys_data->enter_event->event.type, size, 0, 0); if (!event) return; @@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { + struct trace_array *tr = data; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_exit_syscalls)) + if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) return; @@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); + if (!tr->sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - set_bit(num, enabled_enter_syscalls); - sys_refcount_enter++; + set_bit(num, tr->enabled_enter_syscalls); + tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -static void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_enter--; - clear_bit(num, enabled_enter_syscalls); - if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter, NULL); + tr->sys_refcount_enter--; + clear_bit(num, tr->enabled_enter_syscalls); + if (!tr->sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); } -static int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); + if (!tr->sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - set_bit(num, enabled_exit_syscalls); - sys_refcount_exit++; + set_bit(num, tr->enabled_exit_syscalls); + tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -static void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_exit--; - clear_bit(num, enabled_exit_syscalls); - if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit, NULL); + tr->sys_refcount_exit--; + clear_bit(num, tr->enabled_exit_syscalls); + if (!tr->sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); } @@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call) static int syscall_enter_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_enter(event); + return reg_event_syscall_enter(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_enter(event); + unreg_event_syscall_enter(file, event); return 0; #ifdef CONFIG_PERF_EVENTS @@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event, static int syscall_exit_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_exit(event); + return reg_event_syscall_exit(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_exit(event); + unreg_event_syscall_exit(file, event); return 0; #ifdef CONFIG_PERF_EVENTS -- cgit v1.2.3 From 277ba04461c2746cf935353474c0961161951b68 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Fri, 3 Aug 2012 16:10:49 -0400 Subject: tracing: Add interface to allow multiple trace buffers Add the interface ("instances" directory) to add multiple buffers to ftrace. To create a new instance, simply do a mkdir in the instances directory: This will create a directory with the following: # cd instances # mkdir foo # ls foo buffer_size_kb free_buffer trace_clock trace_pipe buffer_total_size_kb set_event trace_marker tracing_enabled events/ trace trace_options tracing_on Currently only events are able to be set, and there isn't a way to delete a buffer when one is created (yet). Note, the i_mutex lock is dropped from the parent "instances" directory during the mkdir operation. As the "instances" directory can not be renamed or deleted (created on boot), I do not see any harm in dropping the lock. The creation of the sub directories is protected by trace_types_lock mutex, which only lets one instance get into the code path at a time. If two tasks try to create or delete directories of the same name, only one will occur and the other will fail with -EEXIST. Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 129 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 2 + kernel/trace/trace_events.c | 12 ++++- 3 files changed, 142 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 406adbc277a0..07a63114d938 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5107,6 +5107,133 @@ static const struct file_operations rb_simple_fops = { .llseek = default_llseek, }; +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static int new_instance_create(const char *name) +{ + enum ring_buffer_flags rb_flags; + struct trace_array *tr; + int ret; + int i; + + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + ret = -ENOMEM; + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out_unlock; + + tr->name = kstrdup(name, GFP_KERNEL); + if (!tr->name) + goto out_free_tr; + + raw_spin_lock_init(&tr->start_lock); + + tr->current_trace = &nop_trace; + + INIT_LIST_HEAD(&tr->systems); + INIT_LIST_HEAD(&tr->events); + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + + tr->buffer = ring_buffer_alloc(trace_buf_size, rb_flags); + if (!tr->buffer) + goto out_free_tr; + + tr->data = alloc_percpu(struct trace_array_cpu); + if (!tr->data) + goto out_free_tr; + + for_each_tracing_cpu(i) { + memset(per_cpu_ptr(tr->data, i), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(tr->data, i)->trace_cpu.cpu = i; + per_cpu_ptr(tr->data, i)->trace_cpu.tr = tr; + } + + /* Holder for file callbacks */ + tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; + tr->trace_cpu.tr = tr; + + tr->dir = debugfs_create_dir(name, trace_instance_dir); + if (!tr->dir) + goto out_free_tr; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) + goto out_free_tr; + + init_tracer_debugfs(tr, tr->dir); + + list_add(&tr->list, &ftrace_trace_arrays); + + mutex_unlock(&trace_types_lock); + + return 0; + + out_free_tr: + if (tr->buffer) + ring_buffer_free(tr->buffer); + kfree(tr->name); + kfree(tr); + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; + +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. If two users try to make the same dir at + * the same time, then the new_instance_create() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = new_instance_create(dentry->d_iname); + + mutex_lock(&inode->i_mutex); + + return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = instance_mkdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ + trace_instance_dir = debugfs_create_dir("instances", d_tracer); + if (WARN_ON(!trace_instance_dir)) + return; + + /* Hijack the dir inode operations, to allow mkdir */ + trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + static void init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { @@ -5183,6 +5310,8 @@ static __init int tracer_init_debugfs(void) (void *)&global_trace.trace_cpu, &snapshot_fops); #endif + create_trace_instances(d_tracer); + create_trace_options_dir(&global_trace); for_each_tracing_cpu(cpu) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 68cad7a9e089..883fe0b62f0a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -175,6 +175,7 @@ struct tracer; struct trace_array { struct ring_buffer *buffer; struct list_head list; + char *name; int cpu; int buffer_disabled; struct trace_cpu trace_cpu; /* place holder */ @@ -999,6 +1000,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, } extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 439955239bae..58a61302a733 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1754,16 +1754,22 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; + mutex_lock(&event_mutex); + entry = debugfs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { pr_warning("Could not create debugfs 'set_event' entry\n"); + mutex_unlock(&event_mutex); return -ENOMEM; } d_events = debugfs_create_dir("events", parent); - if (!d_events) + if (!d_events) { pr_warning("Could not create debugfs 'events' directory\n"); + mutex_unlock(&event_mutex); + return -ENOMEM; + } /* ring buffer internal formats */ trace_create_file("header_page", 0444, d_events, @@ -1778,7 +1784,11 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) tr, &ftrace_tr_enable_fops); tr->event_dir = d_events; + down_write(&trace_event_mutex); __trace_add_event_dirs(tr); + up_write(&trace_event_mutex); + + mutex_unlock(&event_mutex); return 0; } -- cgit v1.2.3 From 0c8916c34203734d3b05953ebace52d7c2969f16 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Tue, 7 Aug 2012 16:14:16 -0400 Subject: tracing: Add rmdir to remove multibuffer instances Add a method to the hijacked dentry descriptor of the "instances" directory to allow for rmdir to remove an instance of a multibuffer. Example: cd /debug/tracing/instances mkdir hello ls hello/ rmdir hello ls Like the mkdir method, the i_mutex is dropped for the instances directory. The instances directory is created at boot up and can not be renamed or removed. The trace_types_lock mutex is used to synchronize adding and removing of instances. I've run several stress tests with different threads trying to create and delete directories of the same name, and it has stood up fine. Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 68 +++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + kernel/trace/trace_events.c | 33 ++++++++++++++++++++++ 3 files changed, 102 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 07a63114d938..ab3df804fa96 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5192,6 +5192,42 @@ static int new_instance_create(const char *name) } +static int instance_delete(const char *name) +{ + struct trace_array *tr; + int found = 0; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + found = 1; + break; + } + } + if (!found) + goto out_unlock; + + list_del(&tr->list); + + event_trace_del_tracer(tr); + debugfs_remove_recursive(tr->dir); + free_percpu(tr->data); + ring_buffer_free(tr->buffer); + + kfree(tr->name); + kfree(tr); + + ret = 0; + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; +} + static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) { struct dentry *parent; @@ -5219,9 +5255,41 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m return ret; } +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* The caller did a dget() on dentry */ + mutex_unlock(&dentry->d_inode->i_mutex); + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. If two users try to make the same dir at + * the same time, then the instance_delete() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = instance_delete(dentry->d_iname); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + return ret; +} + static const struct inode_operations instance_dir_inode_operations = { .lookup = simple_lookup, .mkdir = instance_mkdir, + .rmdir = instance_rmdir, }; static __init void create_trace_instances(struct dentry *d_tracer) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 883fe0b62f0a..b825ea2d8c64 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1001,6 +1001,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, extern void trace_event_enable_cmd_record(bool enable); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr); extern struct mutex event_mutex; extern struct list_head ftrace_events; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 58a61302a733..06d6bc275221 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1709,6 +1709,20 @@ __trace_add_event_dirs(struct trace_array *tr) } } +/* Remove the event directory structure for a trace directory. */ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file, *next; + + list_for_each_entry_safe(file, next, &tr->events, list) { + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kfree(file); + } +} + static void __add_event_to_tracers(struct ftrace_event_call *call, struct ftrace_module_file_ops *file_ops) @@ -1793,6 +1807,25 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) return 0; } +int event_trace_del_tracer(struct trace_array *tr) +{ + /* Disable any running events */ + __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + + mutex_lock(&event_mutex); + + down_write(&trace_event_mutex); + __trace_remove_event_dirs(tr); + debugfs_remove_recursive(tr->event_dir); + up_write(&trace_event_mutex); + + tr->event_dir = NULL; + + mutex_unlock(&event_mutex); + + return 0; +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); -- cgit v1.2.3 From 772482216f170ddc62fa92a3cc3271cdd1993525 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Wed, 27 Feb 2013 16:28:06 -0500 Subject: tracing: Get trace_events kernel command line working again With the new descriptors used to allow multiple buffers in the tracing directory added, the kernel command line parameter trace_events=... no longer works. This is because the top level (global) trace array now has a list of descriptors associated with the events and the files in the debugfs directory. But in early bootup, when the command line is processed and the events enabled, the trace array list of events has not been set up yet. Without the list of events in the trace array, the setting of events to record will fail because it would not match any events. The solution is to set up the top level array in two stages. The first is to just add the ftrace file descriptors that just point to the events. This will allow events to be enabled and start tracing. The second stage is called after the filesystem is set up, and this stage will create the debugfs event files and directories associated with the trace array events. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_events.c | 143 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 136 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 06d6bc275221..21fe83b4106a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1473,6 +1473,28 @@ __trace_add_new_event(struct ftrace_event_call *call, return event_create_dir(tr->event_dir, file, id, enable, filter, format); } +/* + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. + */ +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) + return -ENOMEM; + + file->event_call = call; + file->tr = tr; + list_add(&file->list, &tr->events); + + return 0; +} + struct ftrace_module_file_ops; static void __add_event_to_tracers(struct ftrace_event_call *call, struct ftrace_module_file_ops *file_ops); @@ -1709,6 +1731,56 @@ __trace_add_event_dirs(struct trace_array *tr) } } +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file; + int ret; + + + list_for_each_entry(file, &tr->events, list) { + ret = event_create_dir(tr->event_dir, file, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + file->event_call->name); + } +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + /* Early boot up should not have any modules loaded */ + if (WARN_ON_ONCE(call->mod)) + continue; + + ret = __trace_early_add_new_event(call, tr); + if (ret < 0) + pr_warning("Could not create early event %s\n", + call->name); + } +} + /* Remove the event directory structure for a trace directory. */ static void __trace_remove_event_dirs(struct trace_array *tr) @@ -1763,25 +1835,23 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); -int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) { struct dentry *d_events; struct dentry *entry; - mutex_lock(&event_mutex); - entry = debugfs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { pr_warning("Could not create debugfs 'set_event' entry\n"); - mutex_unlock(&event_mutex); return -ENOMEM; } d_events = debugfs_create_dir("events", parent); if (!d_events) { pr_warning("Could not create debugfs 'events' directory\n"); - mutex_unlock(&event_mutex); return -ENOMEM; } @@ -1798,13 +1868,64 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) tr, &ftrace_tr_enable_fops); tr->event_dir = d_events; + + return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + down_write(&trace_event_mutex); __trace_add_event_dirs(tr); up_write(&trace_event_mutex); + out_unlock: mutex_unlock(&event_mutex); - return 0; + return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. + */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_mutex); + __trace_early_add_event_dirs(tr); + up_write(&trace_event_mutex); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; } int event_trace_del_tracer(struct trace_array *tr) @@ -1842,6 +1963,14 @@ static __init int event_trace_enable(void) list_add(&call->list, &ftrace_events); } + /* + * We need the top trace array to have a working set of trace + * points at early init, before the debug files and directories + * are created. Create the file entries now, and attach them + * to the actual file dentries later. + */ + __trace_early_add_events(tr); + while (true) { token = strsep(&buf, ","); @@ -1882,7 +2011,7 @@ static __init int event_trace_init(void) if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - ret = event_trace_add_tracer(d_tracer, tr); + ret = early_event_add_tracer(d_tracer, tr); if (ret) return ret; -- cgit v1.2.3 From d1a291437f75f6c841819b7855d95a21958cc822 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Wed, 27 Feb 2013 20:23:57 -0500 Subject: tracing: Use kmem_cache_alloc instead of kmalloc in trace_events.c The event structures used by the trace events are mostly persistent, but they are also allocated by kmalloc, which is not the best at allocating space for what is used. By converting these kmallocs into kmem_cache_allocs, we can save over 50K of space that is permanently allocated. After boot we have: slab name active allocated size --------- ------ --------- ---- ftrace_event_file 979 1005 56 67 1 ftrace_event_field 2301 2310 48 77 1 The ftrace_event_file has at boot up 979 active objects out of 1005 allocated in the slabs. Each object is 56 bytes. In a normal kmalloc, that would allocate 64 bytes for each object. 1005 - 979 = 26 objects not used 26 * 56 = 1456 bytes wasted But if we used kmalloc: 64 - 56 = 8 bytes unused per allocation 8 * 979 = 7832 bytes wasted 7832 - 1456 = 6376 bytes in savings Doing the same for ftrace_event_field where there's 2301 objects allocated in a slab that can hold 2310 with 48 bytes each we have: 2310 - 2301 = 9 objects not used 9 * 48 = 432 bytes wasted A kmalloc would also use 64 bytes per object: 64 - 48 = 16 bytes unused per allocation 16 * 2301 = 36816 bytes wasted! 36816 - 432 = 36384 bytes in savings This change gives us a total of 42760 bytes in savings. At least on my machine, but as there's a lot of these persistent objects for all configurations that use trace points, this is a net win. Thanks to Ezequiel Garcia for his trace_analyze presentation which pointed out the wasted space in my code. Cc: Ezequiel Garcia <elezegarcia@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_events.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 21fe83b4106a..5d8845d36fa8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -36,6 +36,11 @@ EXPORT_SYMBOL_GPL(event_storage); LIST_HEAD(ftrace_events); LIST_HEAD(ftrace_common_fields); +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + /* Double loops, do not use break, only goto's work */ #define do_for_each_event_file(tr, file) \ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ @@ -63,7 +68,7 @@ static int __trace_define_field(struct list_head *head, const char *type, { struct ftrace_event_field *field; - field = kzalloc(sizeof(*field), GFP_KERNEL); + field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) goto err; @@ -91,7 +96,7 @@ static int __trace_define_field(struct list_head *head, const char *type, err: if (field) kfree(field->name); - kfree(field); + kmem_cache_free(field_cachep, field); return -ENOMEM; } @@ -143,7 +148,7 @@ void trace_destroy_fields(struct ftrace_event_call *call) list_del(&field->link); kfree(field->type); kfree(field->name); - kfree(field); + kmem_cache_free(field_cachep, field); } } @@ -1383,7 +1388,7 @@ static void remove_event_from_tracers(struct ftrace_event_call *call) list_del(&file->list); debugfs_remove_recursive(file->dir); remove_subsystem(file->system); - kfree(file); + kmem_cache_free(file_cachep, file); /* * The do_for_each_event_file_safe() is @@ -1462,7 +1467,7 @@ __trace_add_new_event(struct ftrace_event_call *call, { struct ftrace_event_file *file; - file = kzalloc(sizeof(*file), GFP_KERNEL); + file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return -ENOMEM; @@ -1484,7 +1489,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call, { struct ftrace_event_file *file; - file = kzalloc(sizeof(*file), GFP_KERNEL); + file = kmem_cache_alloc(file_cachep, GFP_TRACE); if (!file) return -ENOMEM; @@ -1791,7 +1796,7 @@ __trace_remove_event_dirs(struct trace_array *tr) list_del(&file->list); debugfs_remove_recursive(file->dir); remove_subsystem(file->system); - kfree(file); + kmem_cache_free(file_cachep, file); } } @@ -1947,6 +1952,13 @@ int event_trace_del_tracer(struct trace_array *tr) return 0; } +static __init int event_trace_memsetup(void) +{ + field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); + file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); + return 0; +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); @@ -2021,6 +2033,7 @@ static __init int event_trace_init(void) return 0; } +early_initcall(event_trace_memsetup); core_initcall(event_trace_enable); fs_initcall(event_trace_init); -- cgit v1.2.3 From 92edca073c374f66b8eee20ec6426fb0cdb6c4d5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Wed, 27 Feb 2013 20:41:37 -0500 Subject: tracing: Use direct field, type and system names The names used to display the field and type in the event format files are copied, as well as the system name that is displayed. All these names are created by constant values passed in. If one of theses values were to be removed by a module, the module would also be required to remove any event it created. By using the strings directly, we can save over 100K of memory. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.h | 4 ++-- kernel/trace/trace_events.c | 20 +++----------------- 2 files changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b825ea2d8c64..e420f2a230de 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -887,8 +887,8 @@ enum { struct ftrace_event_field { struct list_head link; - char *name; - char *type; + const char *name; + const char *type; int filter_type; int offset; int size; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d8845d36fa8..63b4bdf84593 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -72,13 +72,8 @@ static int __trace_define_field(struct list_head *head, const char *type, if (!field) goto err; - field->name = kstrdup(name, GFP_KERNEL); - if (!field->name) - goto err; - - field->type = kstrdup(type, GFP_KERNEL); - if (!field->type) - goto err; + field->name = name; + field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); @@ -94,8 +89,6 @@ static int __trace_define_field(struct list_head *head, const char *type, return 0; err: - if (field) - kfree(field->name); kmem_cache_free(field_cachep, field); return -ENOMEM; @@ -146,8 +139,6 @@ void trace_destroy_fields(struct ftrace_event_call *call) head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); - kfree(field->type); - kfree(field->name); kmem_cache_free(field_cachep, field); } } @@ -286,7 +277,6 @@ static void __put_system(struct event_subsystem *system) kfree(filter->filter_string); kfree(filter); } - kfree(system->name); kfree(system); } @@ -1202,10 +1192,7 @@ create_new_subsystem(const char *name) return NULL; system->ref_count = 1; - system->name = kstrdup(name, GFP_KERNEL); - - if (!system->name) - goto out_free; + system->name = name; system->filter = NULL; @@ -1218,7 +1205,6 @@ create_new_subsystem(const char *name) return system; out_free: - kfree(system->name); kfree(system); return NULL; } -- cgit v1.2.3 From 189e5784f6c5e001a84127b83f03bc76a8bfb1ec Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 28 Feb 2013 20:03:06 -0500 Subject: tracing: Do not block on splice if either file or splice NONBLOCK flag is set Currently only the splice NONBLOCK flag is checked to determine if the splice read should block or not. But the file descriptor NONBLOCK flag also needs to be checked. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ab3df804fa96..598a7aa7d0ae 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4593,7 +4593,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, /* did we read anything? */ if (!spd.nr_pages) { - if (flags & SPLICE_F_NONBLOCK) + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) ret = -EAGAIN; else ret = 0; -- cgit v1.2.3 From cc60cdc952be09bca5b0bff9fefc7aa6185c3049 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Thu, 28 Feb 2013 09:17:16 -0500 Subject: tracing: Fix polling on trace_pipe_raw The trace_pipe_raw never implemented polling and this was casing issues for several utilities. This is now implemented. Blocked reads still are on the TODO list. Reported-by: Mauro Carvalho Chehab <mchehab@redhat.com> Tested-by: Mauro Carvalho Chehab <mchehab@redhat.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 78 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 51 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 598a7aa7d0ae..4a6e461273a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3555,10 +3555,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) } static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { - struct trace_iterator *iter = filp->private_data; - if (trace_flags & TRACE_ITER_BLOCK) { /* * Always select as readable when in blocking mode @@ -3567,6 +3565,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } else { if (!trace_empty(iter)) return POLLIN | POLLRDNORM; + trace_wakeup_needed = true; poll_wait(filp, &trace_wait, poll_table); if (!trace_empty(iter)) return POLLIN | POLLRDNORM; @@ -3575,6 +3574,14 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table) } } +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + return trace_poll(iter, filp, poll_table); +} + /* * This is a make-shift waitqueue. * A tracer might use this callback on some rare cases: @@ -4362,9 +4369,8 @@ static const struct file_operations snapshot_fops = { #endif /* CONFIG_TRACER_SNAPSHOT */ struct ftrace_buffer_info { - struct trace_array *tr; + struct trace_iterator iter; void *spare; - int cpu; unsigned int read; }; @@ -4381,22 +4387,32 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (!info) return -ENOMEM; - info->tr = tr; - info->cpu = tc->cpu; - info->spare = NULL; + info->iter.tr = tr; + info->iter.cpu_file = tc->cpu; + info->spare = NULL; /* Force reading ring buffer for first read */ - info->read = (unsigned int)-1; + info->read = (unsigned int)-1; filp->private_data = info; return nonseekable_open(inode, filp); } +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + + return trace_poll(iter, filp, poll_table); +} + static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; ssize_t ret; size_t size; @@ -4404,7 +4420,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return 0; if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); + info->spare = ring_buffer_alloc_read_page(iter->tr->buffer, iter->cpu_file); if (!info->spare) return -ENOMEM; @@ -4412,12 +4428,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (info->read < PAGE_SIZE) goto read; - trace_access_lock(info->cpu); - ret = ring_buffer_read_page(info->tr->buffer, + trace_access_lock(iter->cpu_file); + ret = ring_buffer_read_page(iter->tr->buffer, &info->spare, count, - info->cpu, 0); - trace_access_unlock(info->cpu); + iter->cpu_file, 0); + trace_access_unlock(iter->cpu_file); if (ret < 0) return 0; @@ -4442,9 +4458,10 @@ read: static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; if (info->spare) - ring_buffer_free_read_page(info->tr->buffer, info->spare); + ring_buffer_free_read_page(iter->tr->buffer, info->spare); kfree(info); return 0; @@ -4511,6 +4528,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { @@ -4541,8 +4559,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - trace_access_lock(info->cpu); - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: + trace_access_lock(iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file); for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; @@ -4553,15 +4572,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; ref->ref = 1; - ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); + ref->buffer = iter->tr->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { kfree(ref); break; } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 1); + len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); @@ -4585,20 +4604,24 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.nr_pages++; *ppos += PAGE_SIZE; - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file); } - trace_access_unlock(info->cpu); + trace_access_unlock(iter->cpu_file); spd.nr_pages = i; /* did we read anything? */ if (!spd.nr_pages) { - if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { ret = -EAGAIN; - else - ret = 0; - /* TODO: block */ - goto out; + goto out; + } + default_wait_pipe(iter); + if (signal_pending(current)) { + ret = -EINTR; + goto out; + } + goto again; } ret = splice_to_pipe(pipe, &spd); @@ -4610,6 +4633,7 @@ out: static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, + .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, .llseek = no_llseek, -- cgit v1.2.3 From b627344fef0c38fa4e3050348e168e46db87c905 Mon Sep 17 00:00:00 2001 From: Steven Rostedt <srostedt@redhat.com> Date: Thu, 28 Feb 2013 13:44:11 -0500 Subject: tracing: Fix read blocking on trace_pipe_raw If the ring buffer is empty, a read to trace_pipe_raw wont block. The tracing code has the infrastructure to wake up waiting readers, but the trace_pipe_raw doesn't take advantage of that. When a read is done to trace_pipe_raw without the O_NONBLOCK flag set, have the read block until there's data in the requested buffer. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4a6e461273a9..3ec146c96df4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4389,6 +4389,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) info->iter.tr = tr; info->iter.cpu_file = tc->cpu; + info->iter.trace = tr->current_trace; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; @@ -4428,18 +4429,29 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (info->read < PAGE_SIZE) goto read; + again: trace_access_lock(iter->cpu_file); ret = ring_buffer_read_page(iter->tr->buffer, &info->spare, count, iter->cpu_file, 0); trace_access_unlock(iter->cpu_file); - if (ret < 0) + + if (ret < 0) { + if (trace_empty(iter)) { + if ((filp->f_flags & O_NONBLOCK)) + return -EAGAIN; + iter->trace->wait_pipe(iter); + if (signal_pending(current)) + return -EINTR; + goto again; + } return 0; + } info->read = 0; -read: + read: size = PAGE_SIZE - info->read; if (size > count) size = count; @@ -4616,7 +4628,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ret = -EAGAIN; goto out; } - default_wait_pipe(iter); + iter->trace->wait_pipe(iter); if (signal_pending(current)) { ret = -EINTR; goto out; -- cgit v1.2.3 From 15693458c4bc0693fd63a50d60f35b628fcf4e29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 28 Feb 2013 19:59:17 -0500 Subject: tracing/ring-buffer: Move poll wake ups into ring buffer code Move the logic to wake up on ring buffer data into the ring buffer code itself. This simplifies the tracing code a lot and also has the added benefit that waiters on one of the instance buffers can be woken only when data is added to that instance instead of data added to any instance. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ring_buffer.h | 6 ++ kernel/trace/ring_buffer.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.c | 83 ++++--------------------- 3 files changed, 164 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 1342e69542f3..d69cf637a15a 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -4,6 +4,7 @@ #include <linux/kmemcheck.h> #include <linux/mm.h> #include <linux/seq_file.h> +#include <linux/poll.h> struct ring_buffer; struct ring_buffer_iter; @@ -96,6 +97,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) +void ring_buffer_wait(struct ring_buffer *buffer, int cpu); +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table); + + #define RING_BUFFER_ALL_CPUS -1 void ring_buffer_free(struct ring_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7244acde77b0..56b6ea32d2e7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include <linux/trace_clock.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> +#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/hardirq.h> @@ -442,6 +443,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) return ret; } +struct rb_irq_work { + struct irq_work work; + wait_queue_head_t waiters; + bool waiters_pending; +}; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -476,6 +483,8 @@ struct ring_buffer_per_cpu { struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; + + struct rb_irq_work irq_work; }; struct ring_buffer { @@ -495,6 +504,8 @@ struct ring_buffer { struct notifier_block cpu_notify; #endif u64 (*clock)(void); + + struct rb_irq_work irq_work; }; struct ring_buffer_iter { @@ -506,6 +517,118 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ + struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + + wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + DEFINE_WAIT(wait); + struct rb_irq_work *work; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) + schedule(); + + finish_wait(&work->waiters, &wait); +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + work->waiters_pending = true; + poll_wait(filp, &work->waiters, poll_table); + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + return 0; +} + /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ @@ -1061,6 +1184,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); + init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1156,6 +1280,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->clock = trace_clock_local; buffer->reader_lock_key = key; + init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + /* need at least two pages */ if (nr_pages < 2) nr_pages = 2; @@ -2610,6 +2736,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_end_commit(cpu_buffer); } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to @@ -2629,6 +2771,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + trace_recursive_unlock(); preempt_enable_notrace(); @@ -2801,6 +2945,8 @@ int ring_buffer_write(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + ret = 0; out: preempt_enable_notrace(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ec146c96df4..b5b25b6575a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -19,7 +19,6 @@ #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/irqflags.h> -#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> @@ -86,14 +85,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static DEFINE_PER_CPU(bool, trace_cmdline_save); -/* - * When a reader is waiting for data, then this variable is - * set to true. - */ -static bool trace_wakeup_needed; - -static struct irq_work trace_work_wakeup; - /* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization @@ -334,28 +325,12 @@ static inline void trace_access_lock_init(void) #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); - /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -static void trace_wake_up(struct irq_work *work) -{ - wake_up_all(&trace_wait); - -} - /** * tracing_on - enable tracing buffers * @@ -763,36 +738,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) static void default_wait_pipe(struct trace_iterator *iter) { - DEFINE_WAIT(wait); - - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); - - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. - */ - trace_wakeup_needed = true; - - if (trace_empty(iter)) - schedule(); + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return; - finish_wait(&trace_wait, &wait); + ring_buffer_wait(iter->tr->buffer, iter->cpu_file); } /** @@ -1262,11 +1212,6 @@ void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_cmdline_save, true); - if (trace_wakeup_needed) { - trace_wakeup_needed = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&trace_work_wakeup); - } ring_buffer_unlock_commit(buffer, event); } @@ -3557,21 +3502,18 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) static unsigned int trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { - if (trace_flags & TRACE_ITER_BLOCK) { + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return POLLIN | POLLRDNORM; + + if (trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ return POLLIN | POLLRDNORM; - } else { - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - trace_wakeup_needed = true; - poll_wait(filp, &trace_wait, poll_table); - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - - return 0; - } + else + return ring_buffer_poll_wait(iter->tr->buffer, iter->cpu_file, + filp, poll_table); } static unsigned int @@ -5701,7 +5643,6 @@ __init static int tracer_alloc_buffers(void) #endif trace_init_cmdlines(); - init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); -- cgit v1.2.3 From f71130de5c7fba92faf3901784714e37a234c08f Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Thu, 21 Feb 2013 10:32:38 +0800 Subject: tracing: Add a helper function for event print functions Move duplicate code in event print functions to a helper function. This shrinks the size of the kernel by ~13K. text data bss dec hex filename 6596137 1743966 10138672 18478775 119f6b7 vmlinux.o.old 6583002 1743849 10138672 18465523 119c2f3 vmlinux.o.new Link: http://lkml.kernel.org/r/51258746.2060304@huawei.com Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace_event.h | 8 ++++++-- include/trace/ftrace.h | 23 ++++++----------------- kernel/trace/trace_output.c | 26 ++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index fd28c170c597..4d79d2dc189c 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -38,6 +38,12 @@ const char *ftrace_print_symbols_seq_u64(struct trace_seq *p, const char *ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int len); +struct trace_iterator; +struct trace_event; + +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *event); + /* * The trace entry - the most basic unit of tracing. This is what * is printed in the end as a single line in the trace output, such as: @@ -95,8 +101,6 @@ enum trace_iter_flags { }; -struct trace_event; - typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags, struct trace_event *event); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index e5d140a91fd7..17a77fcac2a2 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -227,29 +227,18 @@ static notrace enum print_line_t \ ftrace_raw_output_##call(struct trace_iterator *iter, int flags, \ struct trace_event *trace_event) \ { \ - struct ftrace_event_call *event; \ struct trace_seq *s = &iter->seq; \ + struct trace_seq __maybe_unused *p = &iter->tmp_seq; \ struct ftrace_raw_##call *field; \ - struct trace_entry *entry; \ - struct trace_seq *p = &iter->tmp_seq; \ int ret; \ \ - event = container_of(trace_event, struct ftrace_event_call, \ - event); \ - \ - entry = iter->ent; \ + field = (typeof(field))iter->ent; \ \ - if (entry->type != event->event.type) { \ - WARN_ON_ONCE(1); \ - return TRACE_TYPE_UNHANDLED; \ - } \ - \ - field = (typeof(field))entry; \ - \ - trace_seq_init(p); \ - ret = trace_seq_printf(s, "%s: ", event->name); \ + ret = ftrace_raw_output_prep(iter, trace_event); \ if (ret) \ - ret = trace_seq_printf(s, print); \ + return ret; \ + \ + ret = trace_seq_printf(s, print); \ if (!ret) \ return TRACE_TYPE_PARTIAL_LINE; \ \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 194d79602dc7..aa92ac322ba2 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -397,6 +397,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) +{ + struct ftrace_event_call *event; + struct trace_seq *s = &iter->seq; + struct trace_seq *p = &iter->tmp_seq; + struct trace_entry *entry; + int ret; + + event = container_of(trace_event, struct ftrace_event_call, event); + entry = iter->ent; + + if (entry->type != event->event.type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_init(p); + ret = trace_seq_printf(s, "%s: ", event->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { -- cgit v1.2.3 From 7e4f44b153e1ec07bb64c1c1671cdf492465bbf3 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Thu, 21 Feb 2013 10:33:33 +0800 Subject: tracing: Annotate event field-defining functions with __init Those functions are called either during kernel boot or module init. Before: $ dmesg | grep 'Freeing unused kernel memory' Freeing unused kernel memory: 1208k freed Freeing unused kernel memory: 1360k freed Freeing unused kernel memory: 1960k freed After: $ dmesg | grep 'Freeing unused kernel memory' Freeing unused kernel memory: 1236k freed Freeing unused kernel memory: 1388k freed Freeing unused kernel memory: 1960k freed Link: http://lkml.kernel.org/r/5125877D.5000201@huawei.com Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/trace/ftrace.h | 2 +- kernel/trace/trace_export.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 17a77fcac2a2..a536f66f84c6 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -324,7 +324,7 @@ static struct trace_event_functions ftrace_event_type_funcs_##call = { \ #undef DECLARE_EVENT_CLASS #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, func, print) \ -static int notrace \ +static int notrace __init \ ftrace_define_fields_##call(struct ftrace_event_call *event_call) \ { \ struct ftrace_raw_##call field; \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e039906b037d..4f6a91c1370c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ -int \ +static int __init \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ -- cgit v1.2.3 From b8aae39fc54a2e297698288ac48237cc4c6f83bb Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Thu, 21 Feb 2013 10:33:58 +0800 Subject: tracing/syscalls: Annotate field-defining functions with __init These two functions are called during kernel boot only. Link: http://lkml.kernel.org/r/51258796.7020704@huawei.com Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_syscalls.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a842783ad6be..00b5c3e6fbbe 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -261,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -284,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call) { struct syscall_trace_exit trace; int ret; -- cgit v1.2.3 From 34ef61b1fa6172e994e441f1f0241dc53a75bd5f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Sat, 2 Mar 2013 16:49:10 -0500 Subject: tracing: Add __per_cpu annotation to trace array percpu data pointer With the conversion of the data array to per cpu, sparse now complains about the use of per_cpu_ptr() on the variable. But The variable is allocated with alloc_percpu() and is fine to use. But since the structure that contains the data variable does not annotate it as such, sparse gives out a lot of false warnings. Reported-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e420f2a230de..6728a249e817 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -198,7 +198,7 @@ struct trace_array { struct list_head systems; struct list_head events; struct task_struct *waiter; - struct trace_array_cpu *data; + struct trace_array_cpu __percpu *data; }; enum { -- cgit v1.2.3 From 315326c16ad08771fe0f075a08a18c99976f29f5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Sat, 2 Mar 2013 17:37:14 -0500 Subject: tracing: Fix trace events build without modules The new multi-buffers added a descriptor that kept track of module events, and the directories they use, with struct ftace_module_file_ops. This is used to add a ref count to keep modules from unloading while their files are being accessed. As the descriptor is only needed when CONFIG_MODULES is enabled, it is only declared when the config is enabled. But that struct is dereferenced in a few areas outside the #ifdef CONFIG_MODULES. By adding some helper routines and moving code around a little, events can be compiled again without modules. Reported-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_events.c | 55 +++++++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 63b4bdf84593..0f1307a29fcf 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1546,9 +1546,18 @@ struct ftrace_module_file_ops { struct file_operations filter; }; -static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod) +static struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) { - struct ftrace_module_file_ops *file_ops; + /* + * As event_calls are added in groups by module, + * when we find one file_ops, we don't need to search for + * each call in that module, as the rest should be the + * same. Only search for a new one if the last one did + * not match. + */ + if (file_ops && mod == file_ops->mod) + return file_ops; list_for_each_entry(file_ops, &ftrace_module_file_list, list) { if (file_ops->mod == mod) @@ -1664,16 +1673,35 @@ static int trace_module_notify(struct notifier_block *self, return 0; } + +static int +__trace_add_new_mod_event(struct ftrace_event_call *call, + struct trace_array *tr, + struct ftrace_module_file_ops *file_ops) +{ + return __trace_add_new_event(call, tr, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); +} + #else -static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod) +static inline struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) { return NULL; } -static int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static inline int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { return 0; } +static inline int +__trace_add_new_mod_event(struct ftrace_event_call *call, + struct trace_array *tr, + struct ftrace_module_file_ops *file_ops) +{ + return -ENODEV; +} #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ @@ -1692,20 +1720,11 @@ __trace_add_event_dirs(struct trace_array *tr) * want the module to disappear when reading one * of these files). The file_ops keep account of * the module ref count. - * - * As event_calls are added in groups by module, - * when we find one file_ops, we don't need to search for - * each call in that module, as the rest should be the - * same. Only search for a new one if the last one did - * not match. */ - if (!file_ops || call->mod != file_ops->mod) - file_ops = find_ftrace_file_ops(call->mod); + file_ops = find_ftrace_file_ops(file_ops, call->mod); if (!file_ops) continue; /* Warn? */ - ret = __trace_add_new_event(call, tr, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + ret = __trace_add_new_mod_event(call, tr, file_ops); if (ret < 0) pr_warning("Could not create directory for event %s\n", call->name); @@ -1794,9 +1813,7 @@ __add_event_to_tracers(struct ftrace_event_call *call, list_for_each_entry(tr, &ftrace_trace_arrays, list) { if (file_ops) - __trace_add_new_event(call, tr, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + __trace_add_new_mod_event(call, tr, file_ops); else __trace_add_new_event(call, tr, &ftrace_event_id_fops, -- cgit v1.2.3 From 523c81135bb23b2d9a8c21365d90d21b1309c138 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Mon, 4 Mar 2013 14:15:59 +0800 Subject: tracing: Fix some section mismatch warnings As we've added __init annotation to field-defining functions, we should add __refdata annotation to event_call variables, which reference those functions. Link: http://lkml.kernel.org/r/51343C1F.2050502@huawei.com Reported-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/trace/ftrace.h | 2 +- kernel/trace/trace_export.c | 2 +- kernel/trace/trace_syscalls.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a536f66f84c6..bbf09c2021b9 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -572,7 +572,7 @@ static inline void ftrace_test_probe_##call(void) \ #define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ _TRACE_PERF_PROTO(call, PARAMS(proto)); \ static const char print_fmt_##call[] = print; \ -static struct ftrace_event_class __used event_class_##call = { \ +static struct ftrace_event_class __used __refdata event_class_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_##call.fields),\ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4f6a91c1370c..d21a74670088 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct ftrace_event_class event_class_ftrace_##call = { \ +struct ftrace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 00b5c3e6fbbe..1cd37ffb4093 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -479,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = { .trace = print_syscall_exit, }; -struct ftrace_event_class event_class_syscall_enter = { +struct ftrace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, .define_fields = syscall_enter_define_fields, @@ -487,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = { .raw_init = init_syscall_trace, }; -struct ftrace_event_class event_class_syscall_exit = { +struct ftrace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, .define_fields = syscall_exit_define_fields, -- cgit v1.2.3 From f1dc6725882b5ca54eb9a04436a3b47d58f2cbc7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Mon, 4 Mar 2013 17:33:05 -0500 Subject: ring-buffer: Init waitqueue for blocked readers The move of blocked readers to the ring buffer left out the init of the wait queue that is used. Tests missed this due to running stress tests against the buffers, which didn't allow for any readers to end up waiting. Running a simple read and wait triggered a bug. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ring_buffer.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 56b6ea32d2e7..65fe2a4f9824 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1185,6 +1185,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&cpu_buffer->irq_work.waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1281,6 +1282,7 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->reader_lock_key = key; init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&buffer->irq_work.waiters); /* need at least two pages */ if (nr_pages < 2) -- cgit v1.2.3 From 575380da8b46969a2c6a7e14a51056a63b30fe2e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Mon, 4 Mar 2013 23:05:12 -0500 Subject: tracing: Only clear trace buffer on module unload if event was traced Currently, when a module with events is unloaded, the trace buffer is cleared. This is just a safety net in case the module might have some strange callback when its event is outputted. But there's no reason to reset the buffer if the module didn't have any of its events traced. Add a flag to the event "call" structure called WAS_ENABLED and gets set when the event is ever enabled, and this flag never gets cleared. When a module gets unloaded, if any of its events have this flag set, then the trace buffer will get cleared. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace_event.h | 5 +++++ kernel/trace/trace_events.c | 12 ++++++++---- 2 files changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 0b0814d90164..d6964244e567 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -197,6 +197,7 @@ enum { TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, + TRACE_EVENT_FL_WAS_ENABLED_BIT, }; /* @@ -205,12 +206,16 @@ enum { * CAP_ANY - Any user can enable for perf * NO_SET_FILTER - Set when filter has error and is to be ignored * IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file + * WAS_ENABLED - Set and stays set when an event was ever enabled + * (used for module unloading, if a module event is enabled, + * it is best to clear the buffers that used it). */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), + TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), }; struct ftrace_event_call { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0f1307a29fcf..9a7dc4bf1171 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -245,6 +245,9 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file, break; } file->flags |= FTRACE_EVENT_FL_ENABLED; + + /* WAS_ENABLED gets set but never cleared. */ + call->flags |= TRACE_EVENT_FL_WAS_ENABLED; } break; } @@ -1626,12 +1629,13 @@ static void trace_module_remove_events(struct module *mod) { struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; - bool found = false; + bool clear_trace = false; down_write(&trace_event_mutex); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { - found = true; + if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) + clear_trace = true; __trace_remove_event_call(call); } } @@ -1648,9 +1652,9 @@ static void trace_module_remove_events(struct module *mod) /* * It is safest to reset the ring buffer if the module being unloaded - * registered any events. + * registered any events that were used. */ - if (found) + if (clear_trace) tracing_reset_current_online_cpus(); up_write(&trace_event_mutex); } -- cgit v1.2.3 From 873c642f5964b260480850040dec21e42d0ae4e4 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Mon, 4 Mar 2013 23:26:06 -0500 Subject: tracing: Clear all trace buffers when unloaded module event was used Currently we do not know what buffer a module event was enabled in. On unload, it is safest to clear all buffer instances, not just the top level buffer. Todo: Clear only the buffer that the event was used in. The infrastructure is there to do this, but it makes the code a bit more complex. Lets get the current code vetted before we add that. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 10 ++++++++-- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 10 +++++++--- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b5b25b6575a9..c8a852a55db4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -912,9 +912,15 @@ void tracing_reset_current(int cpu) tracing_reset(&global_trace, cpu); } -void tracing_reset_current_online_cpus(void) +void tracing_reset_all_online_cpus(void) { - tracing_reset_online_cpus(&global_trace); + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + tracing_reset_online_cpus(tr); + } + mutex_unlock(&trace_types_lock); } #define SAVED_CMDLINES 128 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6728a249e817..fa60b2977524 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -496,7 +496,7 @@ int tracing_is_enabled(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, umode_t mode, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9a7dc4bf1171..a376ab5eec5c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1649,14 +1649,18 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } + up_write(&trace_event_mutex); /* * It is safest to reset the ring buffer if the module being unloaded - * registered any events that were used. + * registered any events that were used. The only worry is if + * a new module gets loaded, and takes on the same id as the events + * of this module. When printing out the buffer, traced events left + * over from this module may be passed to the new module events and + * unexpected results may occur. */ if (clear_trace) - tracing_reset_current_online_cpus(); - up_write(&trace_event_mutex); + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, -- cgit v1.2.3 From 22cffc2bb4a50d8c56f03c56f9f19dea85b78e30 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 07:30:24 -0500 Subject: tracing: Enable snapshot when any latency tracer is enabled The snapshot utility is extremely useful, and does not add any more overhead in memory when another latency tracer is enabled. They use the snapshot underneath. There's no reason to hide the snapshot file when a latency tracer has been enabled in the kernel. If any of the latency tracers (irq, preempt or wakeup) is enabled then also select the snapshot facility. Note, snapshot can be enabled without the latency tracers enabled. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index b516a8e19d51..590a27fc212f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -191,6 +191,7 @@ config IRQSOFF_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. @@ -213,6 +214,7 @@ config PREEMPT_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. @@ -232,6 +234,7 @@ config SCHED_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE + select TRACER_SNAPSHOT help This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -- cgit v1.2.3 From 12883efb670c28dff57dcd7f4f995a1ffe153b2d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 09:24:35 -0500 Subject: tracing: Consolidate max_tr into main trace_array structure Currently, the way the latency tracers and snapshot feature works is to have a separate trace_array called "max_tr" that holds the snapshot buffer. For latency tracers, this snapshot buffer is used to swap the running buffer with this buffer to save the current max latency. The only items needed for the max_tr is really just a copy of the buffer itself, the per_cpu data pointers, the time_start timestamp that states when the max latency was triggered, and the cpu that the max latency was triggered on. All other fields in trace_array are unused by the max_tr, making the max_tr mostly bloat. This change removes the max_tr completely, and adds a new structure called trace_buffer, that holds the buffer pointer, the per_cpu data pointers, the time_start timestamp, and the cpu where the latency occurred. The trace_array, now has two trace_buffers, one for the normal trace and one for the max trace or snapshot. By doing this, not only do we remove the bloat from the max_trace but the instances of traces can now use their own snapshot feature and not have just the top level global_trace have the snapshot feature and latency tracers for itself. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace_event.h | 2 + kernel/trace/blktrace.c | 4 +- kernel/trace/trace.c | 486 +++++++++++++++++++---------------- kernel/trace/trace.h | 37 ++- kernel/trace/trace_functions.c | 8 +- kernel/trace/trace_functions_graph.c | 12 +- kernel/trace/trace_irqsoff.c | 10 +- kernel/trace/trace_kdb.c | 8 +- kernel/trace/trace_mmiotrace.c | 12 +- kernel/trace/trace_output.c | 2 +- kernel/trace/trace_sched_switch.c | 8 +- kernel/trace/trace_sched_wakeup.c | 16 +- kernel/trace/trace_selftest.c | 42 +-- kernel/trace/trace_syscalls.c | 4 +- 14 files changed, 365 insertions(+), 286 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d6964244e567..d84c4a575514 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -8,6 +8,7 @@ #include <linux/perf_event.h> struct trace_array; +struct trace_buffer; struct tracer; struct dentry; @@ -67,6 +68,7 @@ struct trace_entry { struct trace_iterator { struct trace_array *tr; struct tracer *trace; + struct trace_buffer *trace_buffer; void *private; int cpu_file; struct mutex mutex; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 71259e2b6b61..90a55054744c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, @@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c8a852a55db4..a08c127db865 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -195,27 +195,15 @@ cycle_t ftrace_now(int cpu) u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.buffer) + if (!global_trace.trace_buffer.buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); return ts; } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array max_tr; - int tracing_is_enabled(void) { return tracing_is_on(); @@ -339,8 +327,8 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | */ void tracing_on(void) { - if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_on(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -361,8 +349,8 @@ EXPORT_SYMBOL_GPL(tracing_on); */ void tracing_off(void) { - if (global_trace.buffer) - ring_buffer_record_off(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_off(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -378,8 +366,8 @@ EXPORT_SYMBOL_GPL(tracing_off); */ int tracing_is_on(void) { - if (global_trace.buffer) - return ring_buffer_record_is_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); return !global_trace.buffer_disabled; } EXPORT_SYMBOL_GPL(tracing_is_on); @@ -637,13 +625,14 @@ unsigned long __read_mostly tracing_max_latency; static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu); - struct trace_array_cpu *max_data; + struct trace_buffer *trace_buf = &tr->trace_buffer; + struct trace_buffer *max_buf = &tr->max_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; - max_data = per_cpu_ptr(max_tr.data, cpu); max_data->saved_latency = tracing_max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -686,9 +675,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); - buf = tr->buffer; - tr->buffer = max_tr.buffer; - max_tr.buffer = buf; + buf = tr->trace_buffer.buffer; + tr->trace_buffer.buffer = tr->max_buffer.buffer; + tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&ftrace_max_lock); @@ -716,7 +705,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -725,7 +714,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * the max trace buffer (no one writes directly to it) * and flag that it failed. */ - trace_array_printk(&max_tr, _THIS_IP_, + trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit in progress\n"); } @@ -742,7 +731,7 @@ static void default_wait_pipe(struct trace_iterator *iter) if (trace_buffer_iter(iter, iter->cpu_file)) return; - ring_buffer_wait(iter->tr->buffer, iter->cpu_file); + ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); } /** @@ -803,17 +792,19 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); tr->current_trace = type; +#ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, trace_buf_size, + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); type->allocated_snapshot = true; } +#endif /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -827,16 +818,18 @@ int register_tracer(struct tracer *type) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { type->allocated_snapshot = false; /* Shrink the max buffer again */ if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, 1, + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } +#endif printk(KERN_CONT "PASSED\n"); } @@ -870,9 +863,9 @@ int register_tracer(struct tracer *type) return ret; } -void tracing_reset(struct trace_array *tr, int cpu) +void tracing_reset(struct trace_buffer *buf, int cpu) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; if (!buffer) return; @@ -886,9 +879,9 @@ void tracing_reset(struct trace_array *tr, int cpu) ring_buffer_record_enable(buffer); } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; int cpu; if (!buffer) @@ -899,7 +892,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) /* Make sure all commits have finished */ synchronize_sched(); - tr->time_start = ftrace_now(tr->cpu); + buf->time_start = ftrace_now(buf->cpu); for_each_online_cpu(cpu) ring_buffer_reset_cpu(buffer, cpu); @@ -909,7 +902,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) void tracing_reset_current(int cpu) { - tracing_reset(&global_trace, cpu); + tracing_reset(&global_trace.trace_buffer, cpu); } void tracing_reset_all_online_cpus(void) @@ -918,7 +911,10 @@ void tracing_reset_all_online_cpus(void) mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE + tracing_reset_online_cpus(&tr->max_buffer); +#endif } mutex_unlock(&trace_types_lock); } @@ -988,13 +984,15 @@ void tracing_start(void) /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); @@ -1026,7 +1024,7 @@ static void tracing_start_tr(struct trace_array *tr) goto out; } - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); @@ -1053,13 +1051,15 @@ void tracing_stop(void) /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); @@ -1080,7 +1080,7 @@ static void tracing_stop_tr(struct trace_array *tr) if (tr->stop_count++) goto out; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -1246,7 +1246,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, unsigned long flags, int pc) { - *current_rb = ftrace_file->tr->buffer; + *current_rb = ftrace_file->tr->trace_buffer.buffer; return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } @@ -1257,7 +1257,7 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, unsigned long flags, int pc) { - *current_rb = global_trace.buffer; + *current_rb = global_trace.trace_buffer.buffer; return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } @@ -1296,7 +1296,7 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -1437,7 +1437,7 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); } /** @@ -1453,7 +1453,8 @@ void trace_dump_stack(void) local_save_flags(flags); /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); + __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, 3, + preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); @@ -1623,7 +1624,7 @@ void trace_printk_init_buffers(void) * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ - if (global_trace.buffer) + if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } @@ -1683,7 +1684,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) @@ -1706,27 +1707,12 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} - -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - struct ring_buffer *buffer; int len = 0, size, pc; struct print_entry *entry; unsigned long flags; @@ -1754,7 +1740,6 @@ int trace_array_vprintk(struct trace_array *tr, local_save_flags(flags); size = sizeof(*entry) + len + 1; - buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, flags, pc); if (!event) @@ -1775,6 +1760,42 @@ int trace_array_vprintk(struct trace_array *tr, return len; } +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1800,7 +1821,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, lost_events); if (event) { @@ -1815,7 +1836,7 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->tr->buffer; + struct ring_buffer *buffer = iter->trace_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; @@ -1892,7 +1913,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } @@ -1925,13 +1946,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct trace_array *tr = iter->tr; struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; - per_cpu_ptr(tr->data, cpu)->skipped_entries = 0; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) @@ -1945,13 +1965,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * by the timestamp being before the start of the buffer. */ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->tr->time_start) + if (ts >= iter->trace_buffer->time_start) break; entries++; ring_buffer_read(buf_iter, NULL); } - per_cpu_ptr(tr->data, cpu)->skipped_entries = entries; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; } /* @@ -1978,8 +1998,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); +#endif if (!iter->snapshot) atomic_inc(&trace_record_cmdline_disabled); @@ -2021,17 +2043,21 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; +#endif if (!iter->snapshot) atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, + unsigned long *total, unsigned long *entries) { unsigned long count; int cpu; @@ -2040,19 +2066,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); + count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ - if (per_cpu_ptr(tr->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(tr->data, cpu)->skipped_entries; + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total += count; } else *total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries += count; } } @@ -2069,27 +2095,27 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# _-----=> irqs-off\n"); seq_puts(m, "# / _----=> need-resched\n"); seq_puts(m, "# | / _---=> hardirq/softirq\n"); @@ -2103,8 +2129,8 @@ void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = per_cpu_ptr(tr->data, tr->cpu); + struct trace_buffer *buf = iter->trace_buffer; + struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; unsigned long entries; unsigned long total; @@ -2112,7 +2138,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) name = type->name; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2123,7 +2149,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) nsecs_to_usecs(data->saved_latency), entries, total, - tr->cpu, + buf->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -2174,7 +2200,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; - if (per_cpu_ptr(iter->tr->data, iter->cpu)->skipped_entries) + if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; cpumask_set_cpu(iter->cpu, iter->started); @@ -2304,7 +2330,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } return 1; @@ -2316,7 +2342,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } } @@ -2394,9 +2420,9 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->tr, m); + print_func_help_header_irq(iter->trace_buffer, m); else - print_func_help_header(iter->tr, m); + print_func_help_header(iter->trace_buffer, m); } } } @@ -2515,11 +2541,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; + iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) - iter->tr = &max_tr; + iter->trace_buffer = &tr->max_buffer; else - iter->tr = tr; +#endif + iter->trace_buffer = &tr->trace_buffer; iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); @@ -2530,7 +2560,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->tr->buffer)) + if (ring_buffer_overruns(iter->trace_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -2544,7 +2574,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -2554,7 +2584,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); @@ -2593,12 +2623,7 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; iter = m->private; - - /* Only the global tracer has a matching max_tr */ - if (iter->tr == &max_tr) - tr = &global_trace; - else - tr = iter->tr; + tr = iter->tr; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { @@ -2634,9 +2659,9 @@ static int tracing_open(struct inode *inode, struct file *file) struct trace_array *tr = tc->tr; if (tc->cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); else - tracing_reset(tr, tc->cpu); + tracing_reset(&tr->trace_buffer, tc->cpu); } if (file->f_mode & FMODE_READ) { @@ -2805,13 +2830,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, */ if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->data, cpu)->disabled); - ring_buffer_record_disable_cpu(tr->buffer, cpu); + atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->data, cpu)->disabled); - ring_buffer_record_enable_cpu(tr->buffer, cpu); + atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); @@ -2930,9 +2955,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_OVERWRITE) { - ring_buffer_change_overwrite(global_trace.buffer, enabled); + ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_change_overwrite(max_tr.buffer, enabled); + ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif } @@ -3116,42 +3141,44 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); return t->init(tr); } -static void set_buffer_entries(struct trace_array *tr, unsigned long val) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) { int cpu; for_each_tracing_cpu(cpu) - per_cpu_ptr(tr->data, cpu)->entries = val; + per_cpu_ptr(buf->data, cpu)->entries = val; } +#ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_array *tr, - struct trace_array *size_tr, int cpu_id) +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id) { int cpu, ret = 0; if (cpu_id == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(tr->buffer, - per_cpu_ptr(size_tr->data, cpu)->entries, cpu); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); if (ret < 0) break; - per_cpu_ptr(tr->data, cpu)->entries = - per_cpu_ptr(size_tr->data, cpu)->entries; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; } } else { - ret = ring_buffer_resize(tr->buffer, - per_cpu_ptr(size_tr->data, cpu_id)->entries, cpu_id); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); if (ret == 0) - per_cpu_ptr(tr->data, cpu_id)->entries = - per_cpu_ptr(size_tr->data, cpu_id)->entries; + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; } return ret; } +#endif /* CONFIG_TRACER_MAX_TRACE */ static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) @@ -3166,20 +3193,22 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ring_buffer_expanded = 1; /* May be called before buffers are initialized */ - if (!tr->buffer) + if (!tr->trace_buffer.buffer) return 0; - ret = ring_buffer_resize(tr->buffer, size, cpu); + ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); if (ret < 0) return ret; +#ifdef CONFIG_TRACER_MAX_TRACE if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || !tr->current_trace->use_max_tr) goto out; - ret = ring_buffer_resize(max_tr.buffer, size, cpu); + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { - int r = resize_buffer_duplicate_size(tr, tr, cpu); + int r = resize_buffer_duplicate_size(&tr->trace_buffer, + &tr->trace_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3202,15 +3231,17 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, } if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&max_tr, size); + set_buffer_entries(&tr->max_buffer, size); else - per_cpu_ptr(max_tr.data, cpu)->entries = size; + per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; out: +#endif /* CONFIG_TRACER_MAX_TRACE */ + if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(tr, size); + set_buffer_entries(&tr->trace_buffer, size); else - per_cpu_ptr(tr->data, cpu)->entries = size; + per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; return ret; } @@ -3277,7 +3308,9 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; +#endif int ret = 0; mutex_lock(&trace_types_lock); @@ -3308,7 +3341,10 @@ static int tracing_set_tracer(const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); +#ifdef CONFIG_TRACER_MAX_TRACE had_max_tr = tr->current_trace->allocated_snapshot; + + /* Current trace needs to be nop_trace before synchronize_sched */ tr->current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3325,22 +3361,28 @@ static int tracing_set_tracer(const char *buf) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. */ - ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); tr->current_trace->allocated_snapshot = false; } +#else + tr->current_trace = &nop_trace; +#endif destroy_trace_option_files(topts); topts = create_trace_option_files(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ - ret = resize_buffer_duplicate_size(&max_tr, &global_trace, + ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->trace_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; t->allocated_snapshot = true; } +#endif if (t->init) { ret = tracer_init(t, tr); @@ -3468,6 +3510,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->cpu_file = tc->cpu; iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->trace_buffer; mutex_init(&iter->mutex); filp->private_data = iter; @@ -3518,7 +3561,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl */ return POLLIN | POLLRDNORM; else - return ring_buffer_poll_wait(iter->tr->buffer, iter->cpu_file, + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, filp, poll_table); } @@ -3857,8 +3900,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf, for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) - size = per_cpu_ptr(tr->data, cpu)->entries; - if (size != per_cpu_ptr(tr->data, cpu)->entries) { + size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { buf_size_same = 0; break; } @@ -3874,7 +3917,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->data, tc->cpu)->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -3921,7 +3964,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += per_cpu_ptr(tr->data, cpu)->entries >> 10; + size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -4026,7 +4069,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { @@ -4111,16 +4154,19 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, tr->clock_id = i; - ring_buffer_set_clock(tr->buffer, trace_clocks[i].func); - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. */ - tracing_reset_online_cpus(&global_trace); - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&global_trace.trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&global_trace.max_buffer); +#endif mutex_unlock(&trace_types_lock); @@ -4160,6 +4206,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) return -ENOMEM; } iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->max_buffer; m->private = iter; file->private_data = m; } @@ -4196,18 +4243,18 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: if (tr->current_trace->allocated_snapshot) { /* free spare buffer */ - ring_buffer_resize(max_tr.buffer, 1, + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); tr->current_trace->allocated_snapshot = false; } break; case 1: if (!tr->current_trace->allocated_snapshot) { /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&max_tr, - &global_trace, RING_BUFFER_ALL_CPUS); + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) break; tr->current_trace->allocated_snapshot = true; @@ -4220,7 +4267,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; default: if (tr->current_trace->allocated_snapshot) - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&tr->max_buffer); break; } @@ -4338,6 +4385,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) info->iter.tr = tr; info->iter.cpu_file = tc->cpu; info->iter.trace = tr->current_trace; + info->iter.trace_buffer = &tr->trace_buffer; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; @@ -4369,7 +4417,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return 0; if (!info->spare) - info->spare = ring_buffer_alloc_read_page(iter->tr->buffer, iter->cpu_file); + info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + iter->cpu_file); if (!info->spare) return -ENOMEM; @@ -4379,7 +4428,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, again: trace_access_lock(iter->cpu_file); - ret = ring_buffer_read_page(iter->tr->buffer, + ret = ring_buffer_read_page(iter->trace_buffer->buffer, &info->spare, count, iter->cpu_file, 0); @@ -4421,7 +4470,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct trace_iterator *iter = &info->iter; if (info->spare) - ring_buffer_free_read_page(iter->tr->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); kfree(info); return 0; @@ -4521,7 +4570,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, again: trace_access_lock(iter->cpu_file); - entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; @@ -4532,7 +4581,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; ref->ref = 1; - ref->buffer = iter->tr->buffer; + ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { kfree(ref); @@ -4564,7 +4613,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.nr_pages++; *ppos += PAGE_SIZE; - entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); } trace_access_unlock(iter->cpu_file); @@ -4605,6 +4654,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, { struct trace_cpu *tc = filp->private_data; struct trace_array *tr = tc->tr; + struct trace_buffer *trace_buf = &tr->trace_buffer; struct trace_seq *s; unsigned long cnt; unsigned long long t; @@ -4617,41 +4667,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_init(s); - cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); - cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); - cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); if (trace_clocks[trace_clock_id].in_ns) { /* local or global for trace_clock */ - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { /* counter or tsc mode for trace_clock */ trace_seq_printf(s, "oldest event ts: %llu\n", - ring_buffer_oldest_event_ts(tr->buffer, cpu)); + ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(tr->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer, cpu)); } - cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); - cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4754,7 +4804,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) static void tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu); + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -5038,7 +5088,7 @@ rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; char buf[64]; int r; @@ -5057,7 +5107,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; unsigned long val; int ret; @@ -5129,18 +5179,18 @@ static int new_instance_create(const char *name) rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - tr->buffer = ring_buffer_alloc(trace_buf_size, rb_flags); - if (!tr->buffer) + tr->trace_buffer.buffer = ring_buffer_alloc(trace_buf_size, rb_flags); + if (!tr->trace_buffer.buffer) goto out_free_tr; - tr->data = alloc_percpu(struct trace_array_cpu); - if (!tr->data) + tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu); + if (!tr->trace_buffer.data) goto out_free_tr; for_each_tracing_cpu(i) { - memset(per_cpu_ptr(tr->data, i), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(tr->data, i)->trace_cpu.cpu = i; - per_cpu_ptr(tr->data, i)->trace_cpu.tr = tr; + memset(per_cpu_ptr(tr->trace_buffer.data, i), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.tr = tr; } /* Holder for file callbacks */ @@ -5164,8 +5214,8 @@ static int new_instance_create(const char *name) return 0; out_free_tr: - if (tr->buffer) - ring_buffer_free(tr->buffer); + if (tr->trace_buffer.buffer) + ring_buffer_free(tr->trace_buffer.buffer); kfree(tr->name); kfree(tr); @@ -5198,8 +5248,8 @@ static int instance_delete(const char *name) event_trace_del_tracer(tr); debugfs_remove_recursive(tr->dir); - free_percpu(tr->data); - ring_buffer_free(tr->buffer); + free_percpu(tr->trace_buffer.data); + ring_buffer_free(tr->trace_buffer.buffer); kfree(tr->name); kfree(tr); @@ -5439,6 +5489,7 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->tr = &global_trace; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; + iter->trace_buffer = &global_trace.trace_buffer; } static void @@ -5476,7 +5527,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5544,7 +5595,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } tracing_on(); } @@ -5594,58 +5645,59 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); - if (!global_trace.buffer) { + global_trace.trace_buffer.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); + if (!global_trace.trace_buffer.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); goto out_free_cpumask; } - global_trace.data = alloc_percpu(struct trace_array_cpu); + global_trace.trace_buffer.data = alloc_percpu(struct trace_array_cpu); - if (!global_trace.data) { + if (!global_trace.trace_buffer.data) { printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); WARN_ON(1); goto out_free_cpumask; } for_each_tracing_cpu(i) { - memset(per_cpu_ptr(global_trace.data, i), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(global_trace.data, i)->trace_cpu.cpu = i; - per_cpu_ptr(global_trace.data, i)->trace_cpu.tr = &global_trace; + memset(per_cpu_ptr(global_trace.trace_buffer.data, i), 0, + sizeof(struct trace_array_cpu)); + per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.tr = &global_trace; } if (global_trace.buffer_disabled) tracing_off(); #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.data = alloc_percpu(struct trace_array_cpu); - if (!max_tr.data) { + global_trace.max_buffer.data = alloc_percpu(struct trace_array_cpu); + if (!global_trace.max_buffer.data) { printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); WARN_ON(1); goto out_free_cpumask; } - max_tr.buffer = ring_buffer_alloc(1, rb_flags); - raw_spin_lock_init(&max_tr.start_lock); - if (!max_tr.buffer) { + global_trace.max_buffer.buffer = ring_buffer_alloc(1, rb_flags); + if (!global_trace.max_buffer.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); - ring_buffer_free(global_trace.buffer); + ring_buffer_free(global_trace.trace_buffer.buffer); goto out_free_cpumask; } for_each_tracing_cpu(i) { - memset(per_cpu_ptr(max_tr.data, i), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(max_tr.data, i)->trace_cpu.cpu = i; - per_cpu_ptr(max_tr.data, i)->trace_cpu.tr = &max_tr; + memset(per_cpu_ptr(global_trace.max_buffer.data, i), 0, + sizeof(struct trace_array_cpu)); + per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.tr = &global_trace; } #endif /* Allocate the first page for all buffers */ - set_buffer_entries(&global_trace, - ring_buffer_size(global_trace.buffer, 0)); + set_buffer_entries(&global_trace.trace_buffer, + ring_buffer_size(global_trace.trace_buffer.buffer, 0)); #ifdef CONFIG_TRACER_MAX_TRACE - set_buffer_entries(&max_tr, 1); + set_buffer_entries(&global_trace.max_buffer, 1); #endif trace_init_cmdlines(); @@ -5682,8 +5734,10 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: - free_percpu(global_trace.data); - free_percpu(max_tr.data); + free_percpu(global_trace.trace_buffer.data); +#ifdef CONFIG_TRACER_MAX_TRACE + free_percpu(global_trace.max_buffer.data); +#endif free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fa60b2977524..986834f1f4dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -167,16 +167,37 @@ struct trace_array_cpu { struct tracer; +struct trace_buffer { + struct trace_array *tr; + struct ring_buffer *buffer; + struct trace_array_cpu __percpu *data; + cycle_t time_start; + int cpu; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: */ struct trace_array { - struct ring_buffer *buffer; struct list_head list; char *name; - int cpu; + struct trace_buffer trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * The max_buffer is used to snapshot the trace when a maximum + * latency is reached, or when the user initiates a snapshot. + * Some tracers will use this to store a maximum trace while + * it continues examining live traces. + * + * The buffers for the max_buffer are set up the same as the trace_buffer + * When a snapshot is taken, the buffer of the max_buffer is swapped + * with the buffer of the trace_buffer and the buffers are reset for + * the trace_buffer so the tracing can continue. + */ + struct trace_buffer max_buffer; +#endif int buffer_disabled; struct trace_cpu trace_cpu; /* place holder */ #ifdef CONFIG_FTRACE_SYSCALLS @@ -189,7 +210,6 @@ struct trace_array { int clock_id; struct tracer *current_trace; unsigned int flags; - cycle_t time_start; raw_spinlock_t start_lock; struct dentry *dir; struct dentry *options; @@ -198,7 +218,6 @@ struct trace_array { struct list_head systems; struct list_head events; struct task_struct *waiter; - struct trace_array_cpu __percpu *data; }; enum { @@ -345,9 +364,11 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; bool print_max; + bool enabled; +#ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; bool allocated_snapshot; - bool enabled; +#endif }; @@ -493,8 +514,8 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); @@ -674,6 +695,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9d73861efc6a..e467c0c7bdd5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void); static int function_trace_init(struct trace_array *tr) { func_trace = tr; - tr->cpu = get_cpu(); + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); @@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } /* Our option */ @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, goto out; cpu = smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, */ local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ca986d61a282..8388bc99f2ee 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9b52f9cf7a0d..5aa40ab72b57 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -121,7 +121,7 @@ static int func_prolog_dec(struct trace_array *tr, if (!irqs_disabled_flags(*flags)) return 0; - *data = per_cpu_ptr(tr->data, cpu); + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) @@ -175,7 +175,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) per_cpu(tracing_cpu, cpu) = 0; tracing_max_latency = 0; - tracing_reset_online_cpus(irqsoff_trace); + tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); } @@ -380,7 +380,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (per_cpu(tracing_cpu, cpu)) return; - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || atomic_read(&data->disabled)) return; @@ -418,7 +418,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) if (!tracer_enabled) return; - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) @@ -568,7 +568,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); if (start_irqsoff_tracer(tr, is_graph())) printk(KERN_ERR "failed to start irqsoff tracer\n"); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 349f6941e8f2..bd90e1b06088 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags; @@ -46,14 +46,14 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -83,7 +83,7 @@ out: trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } for_each_tracing_cpu(cpu) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2472f6f76b50..a5e8f4878bfa 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr) overrun_detected = false; prev_overruns = 0; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } static int mmio_trace_init(struct trace_array *tr) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { unsigned long cnt = atomic_xchg(&dropped_count, 0); - unsigned long over = ring_buffer_overruns(iter->tr->buffer); + unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); if (over > prev_overruns) cnt += over - prev_overruns; @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct ftrace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->data, smp_processor_id()); + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_rw(tr, data, rw); } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct ftrace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) struct trace_array_cpu *data; preempt_disable(); - data = per_cpu_ptr(tr->data, smp_processor_id()); + data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_map(tr, data, map); preempt_enable(); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index aa92ac322ba2..2edc7220d017 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -643,7 +643,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; - unsigned long long abs_ts = iter->ts - iter->tr->time_start; + unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; struct trace_seq *s = &iter->seq; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 1ffe39abd6fc..4e98e3b257a3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->data, cpu); + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->data, cpu); + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f9ceb75a95b7..c16f8cd63c3c 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -89,7 +89,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (cpu != wakeup_current_cpu) goto out_enable; - *data = per_cpu_ptr(tr->data, cpu); + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; @@ -353,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -365,7 +365,7 @@ probe_wakeup_sched_switch(void *ignore, goto out_unlock; /* The task we are waiting for is waking up */ - data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu); + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +387,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -405,7 +405,7 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); local_irq_save(flags); arch_spin_lock(&wakeup_lock); @@ -435,7 +435,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) return; pc = preempt_count(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -458,7 +458,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) local_save_flags(flags); - data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu); + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +472,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c29..8672c40cb153 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry) return 0; } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer_event *event; struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { + while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) local_irq_save(flags); arch_spin_lock(&ftrace_max_lock); - cnt = ring_buffer_entries(tr->buffer); + cnt = ring_buffer_entries(buf->buffer); /* * The trace_test_buffer_cpu runs a while loop to consume all data. @@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) */ tracing_off(); for_each_possible_cpu(cpu) { - ret = trace_test_buffer_cpu(tr, cpu); + ret = trace_test_buffer_cpu(buf, cpu); if (ret) break; } @@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* we should have nothing in the buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); if (ret) goto out; @@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); tracing_start(); /* we should only have one item */ @@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -737,7 +737,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); @@ -760,7 +760,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -815,9 +815,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -877,9 +877,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -943,11 +943,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (ret) goto out; @@ -973,11 +973,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -1084,10 +1084,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); printk("ret = %d\n", ret); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); @@ -1126,7 +1126,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 1cd37ffb4093..68f3f344be65 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->enter_event->event.type, size, 0, 0); if (!event) @@ -355,7 +355,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) -- cgit v1.2.3 From f1affcaaa861f27752a769f889bf1486ebd301fe Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 14:35:11 -0500 Subject: tracing: Add snapshot in the per_cpu trace directories Add the snapshot file into the per_cpu tracing directories to allow them to be read for an individual cpu. This also allows to clear an individual cpu from the snapshot buffer. If the kernel allows it (CONFIG_RING_BUFFER_ALLOW_SWAP is set), then echoing in '1' into one of the per_cpu snapshot files will do an individual cpu buffer swap instead of the entire file. Cc: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 66 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 56 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a08c127db865..303932688964 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2436,6 +2436,31 @@ static void test_ftrace_alive(struct seq_file *m) } #ifdef CONFIG_TRACER_MAX_TRACE +static void show_snapshot_main_help(struct seq_file *m) +{ + seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer.\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); + seq_printf(m, "# Must use main snapshot file to allocate.\n"); +#endif + seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} + static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { if (iter->trace->allocated_snapshot) @@ -2444,12 +2469,10 @@ static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); seq_printf(m, "# Snapshot commands:\n"); - seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); - seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); - seq_printf(m, "# Takes a snapshot of the main buffer.\n"); - seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); - seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); - seq_printf(m, "# is not a '0' or '1')\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); } #else /* Should never be called */ @@ -4207,6 +4230,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) } iter->tr = tc->tr; iter->trace_buffer = &tc->tr->max_buffer; + iter->cpu_file = tc->cpu; m->private = iter; file->private_data = m; } @@ -4241,6 +4265,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, switch (val) { case 0: + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } if (tr->current_trace->allocated_snapshot) { /* free spare buffer */ ring_buffer_resize(tr->max_buffer.buffer, 1, @@ -4251,6 +4279,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, } break; case 1: +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } +#endif if (!tr->current_trace->allocated_snapshot) { /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, @@ -4259,15 +4294,21 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; tr->current_trace->allocated_snapshot = true; } - local_irq_disable(); /* Now, we're going to swap */ - update_max_tr(&global_trace, current, smp_processor_id()); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + update_max_tr(&global_trace, current, smp_processor_id()); + else + update_max_tr_single(&global_trace, current, iter->cpu_file); local_irq_enable(); break; default: - if (tr->current_trace->allocated_snapshot) - tracing_reset_online_cpus(&tr->max_buffer); + if (tr->current_trace->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->max_buffer); + else + tracing_reset(&tr->max_buffer, iter->cpu_file); + } break; } @@ -4835,6 +4876,11 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) trace_create_file("buffer_size_kb", 0444, d_cpu, (void *)&data->trace_cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_cpu, + (void *)&data->trace_cpu, &snapshot_fops); +#endif } #ifdef CONFIG_FTRACE_SELFTEST -- cgit v1.2.3 From 0b85ffc293044393623059eda9904a7d5b644e36 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 14:50:23 -0500 Subject: tracing: Add config option to allow snapshot to swap per cpu When the preempt or irq latency tracers are enabled, they require the ring buffer to be able to swap the per cpu sub buffers between two main buffers. This adds a slight overhead to tracing as the trace recording needs to perform some checks to synchronize between recording and swaps that might be happening on other CPUs. The config RING_BUFFER_ALLOW_SWAP is set when a user of the ring buffer needs the "swap cpu" feature, otherwise the extra checks are not implemented and removed from the tracing overhead. The snapshot feature will swap per CPU if the RING_BUFFER_ALLOW_SWAP config is set. But that only gets set by things like OPROFILE and the irqs and preempt latency tracers. This config is added to let the user decide to include this feature with the snapshot agnostic from whether or not another user of the ring buffer sets this config. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/Kconfig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 590a27fc212f..f78eab251897 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -192,6 +192,7 @@ config IRQSOFF_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. @@ -215,6 +216,7 @@ config PREEMPT_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. @@ -266,6 +268,27 @@ config TRACER_SNAPSHOT echo 1 > /sys/kernel/debug/tracing/snapshot cat snapshot +config TRACER_SNAPSHOT_PER_CPU_SWAP + bool "Allow snapshot to swap per CPU" + depends on TRACER_SNAPSHOT + select RING_BUFFER_ALLOW_SWAP + help + Allow doing a snapshot of a single CPU buffer instead of a + full swap (all buffers). If this is set, then the following is + allowed: + + echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + + After which, only the tracing buffer for CPU 2 was swapped with + the main tracing buffer, and the other CPU buffers remain the same. + + When this is enabled, this adds a little more overhead to the + trace recording, as it needs to add some checks to synchronize + recording with swaps. But this does not affect the performance + of the overall system. This is enabled by default when the preempt + or irq latency tracers are enabled, as those need to swap as well + and already adds the overhead (plus a lot more). + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER -- cgit v1.2.3 From 6de58e6269cd0568ca5fbae14423914eff0f7811 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 16:18:16 -0500 Subject: tracing: Add snapshot_raw to extract the raw data from snapshot Add a 'snapshot_raw' per_cpu file that allows tools to read the raw binary data of the snapshot buffer. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 113 +++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 303932688964..9bb0b52cbd32 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4206,6 +4206,12 @@ static int tracing_clock_open(struct inode *inode, struct file *file) return single_open(file, tracing_clock_show, inode->i_private); } +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int read; +}; + #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { @@ -4336,6 +4342,35 @@ static int tracing_snapshot_release(struct inode *inode, struct file *file) return 0; } +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = filp->private_data; + + if (info->iter.trace->use_max_tr) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.trace_buffer = &info->iter.tr->max_buffer; + + return ret; +} + #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4402,14 +4437,17 @@ static const struct file_operations snapshot_fops = { .llseek = tracing_seek, .release = tracing_snapshot_release, }; -#endif /* CONFIG_TRACER_SNAPSHOT */ -struct ftrace_buffer_info { - struct trace_iterator iter; - void *spare; - unsigned int read; +static const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, }; +#endif /* CONFIG_TRACER_SNAPSHOT */ + static int tracing_buffers_open(struct inode *inode, struct file *filp) { struct trace_cpu *tc = inode->i_private; @@ -4452,16 +4490,26 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, struct ftrace_buffer_info *info = filp->private_data; struct trace_iterator *iter = &info->iter; ssize_t ret; - size_t size; + ssize_t size; if (!count) return 0; + mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + size = -EBUSY; + goto out_unlock; + } +#endif + if (!info->spare) info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, iter->cpu_file); + size = -ENOMEM; if (!info->spare) - return -ENOMEM; + goto out_unlock; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) @@ -4477,31 +4525,42 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) { if (trace_empty(iter)) { - if ((filp->f_flags & O_NONBLOCK)) - return -EAGAIN; + if ((filp->f_flags & O_NONBLOCK)) { + size = -EAGAIN; + goto out_unlock; + } + mutex_unlock(&trace_types_lock); iter->trace->wait_pipe(iter); - if (signal_pending(current)) - return -EINTR; + mutex_lock(&trace_types_lock); + if (signal_pending(current)) { + size = -EINTR; + goto out_unlock; + } goto again; } - return 0; + size = 0; + goto out_unlock; } info->read = 0; - read: size = PAGE_SIZE - info->read; if (size > count) size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) - return -EFAULT; + if (ret == size) { + size = -EFAULT; + goto out_unlock; + } size -= ret; *ppos += size; info->read += size; + out_unlock: + mutex_unlock(&trace_types_lock); + return size; } @@ -4591,10 +4650,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, }; struct buffer_ref *ref; int entries, size, i; - size_t ret; + ssize_t ret; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; + mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } +#endif + + if (splice_grow_spd(pipe, &spd)) { + ret = -ENOMEM; + goto out; + } if (*ppos & (PAGE_SIZE - 1)) { ret = -EINVAL; @@ -4666,7 +4736,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ret = -EAGAIN; goto out; } + mutex_unlock(&trace_types_lock); iter->trace->wait_pipe(iter); + mutex_lock(&trace_types_lock); if (signal_pending(current)) { ret = -EINTR; goto out; @@ -4677,6 +4749,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, ret = splice_to_pipe(pipe, &spd); splice_shrink_spd(&spd); out: + mutex_unlock(&trace_types_lock); + return ret; } @@ -4880,6 +4954,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) #ifdef CONFIG_TRACER_SNAPSHOT trace_create_file("snapshot", 0644, d_cpu, (void *)&data->trace_cpu, &snapshot_fops); + + trace_create_file("snapshot_raw", 0444, d_cpu, + (void *)&data->trace_cpu, &snapshot_raw_fops); #endif } -- cgit v1.2.3 From 45ad21ca5530efdca6a19e4a5ac5e7bd6e24f996 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 18:25:02 -0500 Subject: tracing: Have trace_array keep track if snapshot buffer is allocated The snapshot buffer belongs to the trace array not the tracer that is running. The trace array should be the data structure that keeps track of whether or not the snapshot buffer is allocated, not the tracer desciptor. Having the trace array keep track of it makes modifications so much easier. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 32 +++++++++++++++----------------- kernel/trace/trace.h | 2 +- 2 files changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9bb0b52cbd32..bcc9460c2d65 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -667,7 +667,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) WARN_ON_ONCE(!irqs_disabled()); - if (!tr->current_trace->allocated_snapshot) { + if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; @@ -700,7 +700,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (WARN_ON_ONCE(!tr->current_trace->allocated_snapshot)) + if (WARN_ON_ONCE(!tr->allocated_snapshot)) return; arch_spin_lock(&ftrace_max_lock); @@ -802,7 +802,7 @@ int register_tracer(struct tracer *type) if (ring_buffer_expanded) ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); - type->allocated_snapshot = true; + tr->allocated_snapshot = true; } #endif @@ -822,7 +822,7 @@ int register_tracer(struct tracer *type) #ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { - type->allocated_snapshot = false; + tr->allocated_snapshot = false; /* Shrink the max buffer again */ if (ring_buffer_expanded) @@ -2463,7 +2463,7 @@ static void show_snapshot_percpu_help(struct seq_file *m) static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { - if (iter->trace->allocated_snapshot) + if (iter->tr->allocated_snapshot) seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); else seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); @@ -3364,12 +3364,12 @@ static int tracing_set_tracer(const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); -#ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->current_trace->allocated_snapshot; - /* Current trace needs to be nop_trace before synchronize_sched */ tr->current_trace = &nop_trace; +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->allocated_snapshot; + if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that @@ -3387,10 +3387,8 @@ static int tracing_set_tracer(const char *buf) ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); - tr->current_trace->allocated_snapshot = false; + tr->allocated_snapshot = false; } -#else - tr->current_trace = &nop_trace; #endif destroy_trace_option_files(topts); @@ -3403,7 +3401,7 @@ static int tracing_set_tracer(const char *buf) RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; - t->allocated_snapshot = true; + tr->allocated_snapshot = true; } #endif @@ -4275,13 +4273,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = -EINVAL; break; } - if (tr->current_trace->allocated_snapshot) { + if (tr->allocated_snapshot) { /* free spare buffer */ ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); set_buffer_entries(&tr->max_buffer, 1); tracing_reset_online_cpus(&tr->max_buffer); - tr->current_trace->allocated_snapshot = false; + tr->allocated_snapshot = false; } break; case 1: @@ -4292,13 +4290,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; } #endif - if (!tr->current_trace->allocated_snapshot) { + if (!tr->allocated_snapshot) { /* allocate spare buffer */ ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->trace_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) break; - tr->current_trace->allocated_snapshot = true; + tr->allocated_snapshot = true; } local_irq_disable(); /* Now, we're going to swap */ @@ -4309,7 +4307,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, local_irq_enable(); break; default: - if (tr->current_trace->allocated_snapshot) { + if (tr->allocated_snapshot) { if (iter->cpu_file == RING_BUFFER_ALL_CPUS) tracing_reset_online_cpus(&tr->max_buffer); else diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 986834f1f4dd..1a456c291a07 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -197,6 +197,7 @@ struct trace_array { * the trace_buffer so the tracing can continue. */ struct trace_buffer max_buffer; + bool allocated_snapshot; #endif int buffer_disabled; struct trace_cpu trace_cpu; /* place holder */ @@ -367,7 +368,6 @@ struct tracer { bool enabled; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; - bool allocated_snapshot; #endif }; -- cgit v1.2.3 From 737223fbca3b1c91feb947c7f571b35749b743b6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 21:13:47 -0500 Subject: tracing: Consolidate buffer allocation code There's a bit of duplicate code in creating the trace buffers for the normal trace buffer and the max trace buffer among the instances and the main global_trace. This code can be consolidated and cleaned up a bit making the code cleaner and more readable as well as less duplication. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 130 +++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bcc9460c2d65..57895d476509 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3171,6 +3171,7 @@ int tracer_init(struct tracer *t, struct trace_array *tr) static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) { int cpu; + for_each_tracing_cpu(cpu) per_cpu_ptr(buf->data, cpu)->entries = val; } @@ -5267,12 +5268,70 @@ struct dentry *trace_instance_dir; static void init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); -static int new_instance_create(const char *name) +static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) +{ + int cpu; + + for_each_tracing_cpu(cpu) { + memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; + per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; + } +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) { enum ring_buffer_flags rb_flags; + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + + tr->trace_buffer.buffer = ring_buffer_alloc(size, rb_flags); + if (!tr->trace_buffer.buffer) + goto out_free; + + tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu); + if (!tr->trace_buffer.data) + goto out_free; + + init_trace_buffers(tr, &tr->trace_buffer); + + /* Allocate the first page for all buffers */ + set_buffer_entries(&tr->trace_buffer, + ring_buffer_size(tr->trace_buffer.buffer, 0)); + +#ifdef CONFIG_TRACER_MAX_TRACE + + tr->max_buffer.buffer = ring_buffer_alloc(1, rb_flags); + if (!tr->max_buffer.buffer) + goto out_free; + + tr->max_buffer.data = alloc_percpu(struct trace_array_cpu); + if (!tr->max_buffer.data) + goto out_free; + + init_trace_buffers(tr, &tr->max_buffer); + + set_buffer_entries(&tr->max_buffer, 1); +#endif + return 0; + + out_free: + if (tr->trace_buffer.buffer) + ring_buffer_free(tr->trace_buffer.buffer); + free_percpu(tr->trace_buffer.data); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->max_buffer.buffer) + ring_buffer_free(tr->max_buffer.buffer); + free_percpu(tr->max_buffer.data); +#endif + return -ENOMEM; +} + +static int new_instance_create(const char *name) +{ struct trace_array *tr; int ret; - int i; mutex_lock(&trace_types_lock); @@ -5298,22 +5357,9 @@ static int new_instance_create(const char *name) INIT_LIST_HEAD(&tr->systems); INIT_LIST_HEAD(&tr->events); - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - - tr->trace_buffer.buffer = ring_buffer_alloc(trace_buf_size, rb_flags); - if (!tr->trace_buffer.buffer) - goto out_free_tr; - - tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu); - if (!tr->trace_buffer.data) + if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - for_each_tracing_cpu(i) { - memset(per_cpu_ptr(tr->trace_buffer.data, i), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.cpu = i; - per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.tr = tr; - } - /* Holder for file callbacks */ tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; tr->trace_cpu.tr = tr; @@ -5736,8 +5782,6 @@ EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { int ring_buf_size; - enum ring_buffer_flags rb_flags; - int i; int ret = -ENOMEM; @@ -5758,69 +5802,21 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&global_trace.start_lock); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.trace_buffer.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); - if (!global_trace.trace_buffer.buffer) { + if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); goto out_free_cpumask; } - global_trace.trace_buffer.data = alloc_percpu(struct trace_array_cpu); - - if (!global_trace.trace_buffer.data) { - printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); - WARN_ON(1); - goto out_free_cpumask; - } - - for_each_tracing_cpu(i) { - memset(per_cpu_ptr(global_trace.trace_buffer.data, i), 0, - sizeof(struct trace_array_cpu)); - per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.cpu = i; - per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.tr = &global_trace; - } - if (global_trace.buffer_disabled) tracing_off(); -#ifdef CONFIG_TRACER_MAX_TRACE - global_trace.max_buffer.data = alloc_percpu(struct trace_array_cpu); - if (!global_trace.max_buffer.data) { - printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); - WARN_ON(1); - goto out_free_cpumask; - } - global_trace.max_buffer.buffer = ring_buffer_alloc(1, rb_flags); - if (!global_trace.max_buffer.buffer) { - printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); - WARN_ON(1); - ring_buffer_free(global_trace.trace_buffer.buffer); - goto out_free_cpumask; - } - - for_each_tracing_cpu(i) { - memset(per_cpu_ptr(global_trace.max_buffer.data, i), 0, - sizeof(struct trace_array_cpu)); - per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.cpu = i; - per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.tr = &global_trace; - } -#endif - - /* Allocate the first page for all buffers */ - set_buffer_entries(&global_trace.trace_buffer, - ring_buffer_size(global_trace.trace_buffer.buffer, 0)); -#ifdef CONFIG_TRACER_MAX_TRACE - set_buffer_entries(&global_trace.max_buffer, 1); -#endif - trace_init_cmdlines(); register_tracer(&nop_trace); -- cgit v1.2.3 From ce9bae55972b228cf7bac34350c4d2caf8ea0d0b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 21:23:55 -0500 Subject: tracing: Add snapshot feature to instances Add the "snapshot" file to the the multi-buffer instances. cd /sys/kernel/debug/tracing/instances mkdir foo ls foo buffer_size_kb buffer_total_size_kb events free_buffer set_event snapshot trace trace_clock trace_marker trace_options trace_pipe tracing_on cat foo/snapshot # tracer: nop # # # * Snapshot is freed * # # Snapshot commands: # echo 0 > snapshot : Clears and frees snapshot buffer # echo 1 > snapshot : Allocates snapshot buffer, if not already allocated. # Takes a snapshot of the main buffer. # echo 2 > snapshot : Clears snapshot buffer (but does not allocate) # (Doesn't have to be '2' works with any number that # is not a '0' or '1') Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 57895d476509..17671bc9a4b1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4302,9 +4302,9 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, local_irq_disable(); /* Now, we're going to swap */ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) - update_max_tr(&global_trace, current, smp_processor_id()); + update_max_tr(tr, current, smp_processor_id()); else - update_max_tr_single(&global_trace, current, iter->cpu_file); + update_max_tr_single(tr, current, iter->cpu_file); local_irq_enable(); break; default: @@ -5533,6 +5533,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("tracing_on", 0644, d_tracer, tr, &rb_simple_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + (void *)&tr->trace_cpu, &snapshot_fops); +#endif } static __init int tracer_init_debugfs(void) @@ -5574,11 +5579,6 @@ static __init int tracer_init_debugfs(void) &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, - (void *)&global_trace.trace_cpu, &snapshot_fops); -#endif - create_trace_instances(d_tracer); create_trace_options_dir(&global_trace); -- cgit v1.2.3 From 121aaee7b0a82605d33af200c7e9ebab6fd6e444 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Tue, 5 Mar 2013 21:52:25 -0500 Subject: tracing: Add per_cpu directory into tracing instances Add the per_cpu directory to the created tracing instances: cd /sys/kernel/debug/tracing/instances mkdir foo ls foo/per_cpu/cpu0 buffer_size_kb snapshot_raw trace trace_pipe_raw snapshot stats trace_pipe Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 17671bc9a4b1..c547ebbe36ff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5506,6 +5506,7 @@ static __init void create_trace_instances(struct dentry *d_tracer) static void init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { + int cpu; trace_create_file("trace_options", 0644, d_tracer, tr, &tracing_iter_fops); @@ -5538,12 +5539,15 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("snapshot", 0644, d_tracer, (void *)&tr->trace_cpu, &snapshot_fops); #endif + + for_each_tracing_cpu(cpu) + tracing_init_debugfs_percpu(tr, cpu); + } static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - int cpu; trace_access_lock_init(); @@ -5583,9 +5587,6 @@ static __init int tracer_init_debugfs(void) create_trace_options_dir(&global_trace); - for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(&global_trace, cpu); - return 0; } -- cgit v1.2.3 From a695cb5816228f86576f5f5c6809fdf8ed382ece Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Wed, 6 Mar 2013 15:27:24 -0500 Subject: tracing: Prevent deleting instances when they are being read Add a ref count to the trace_array structure and prevent removal of instances that have open descriptors. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 23 +++++++++++++++++++++++ kernel/trace/trace.h | 1 + 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c547ebbe36ff..3a89496dc99b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2613,6 +2613,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) tracing_iter_reset(iter, cpu); } + tr->ref++; + mutex_unlock(&trace_types_lock); return iter; @@ -2649,6 +2651,10 @@ static int tracing_release(struct inode *inode, struct file *file) tr = iter->tr; mutex_lock(&trace_types_lock); + + WARN_ON(!tr->ref); + tr->ref--; + for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -4460,6 +4466,10 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (!info) return -ENOMEM; + mutex_lock(&trace_types_lock); + + tr->ref++; + info->iter.tr = tr; info->iter.cpu_file = tc->cpu; info->iter.trace = tr->current_trace; @@ -4470,6 +4480,8 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) filp->private_data = info; + mutex_unlock(&trace_types_lock); + return nonseekable_open(inode, filp); } @@ -4568,10 +4580,17 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct ftrace_buffer_info *info = file->private_data; struct trace_iterator *iter = &info->iter; + mutex_lock(&trace_types_lock); + + WARN_ON(!iter->tr->ref); + iter->tr->ref--; + if (info->spare) ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); kfree(info); + mutex_unlock(&trace_types_lock); + return 0; } @@ -5411,6 +5430,10 @@ static int instance_delete(const char *name) if (!found) goto out_unlock; + ret = -EBUSY; + if (tr->ref) + goto out_unlock; + list_del(&tr->list); event_trace_del_tracer(tr); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1a456c291a07..f4931821a966 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -219,6 +219,7 @@ struct trace_array { struct list_head systems; struct list_head events; struct task_struct *waiter; + int ref; }; enum { -- cgit v1.2.3 From ad909e21bbe69f1d39055d346540abd827190eca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Wed, 6 Mar 2013 21:45:37 -0500 Subject: tracing: Add internal tracing_snapshot() functions The new snapshot feature is quite handy. It's a way for the user to take advantage of the spare buffer that, until then, only the latency tracers used to "snapshot" the buffer when it hit a max latency. Now users can trigger a "snapshot" manually when some condition is hit in a program. But a snapshot currently can not be triggered by a condition inside the kernel. With the addition of tracing_snapshot() and tracing_snapshot_alloc(), snapshots can now be taking when a condition is hit, and the developer wants to snapshot the case without stopping the trace. Note, any snapshot will overwrite the old one, so take care in how this is done. These new functions are to be used like tracing_on(), tracing_off() and trace_printk() are. That is, they should never be called in the mainline Linux kernel. They are solely for the purpose of debugging. The tracing_snapshot() will not allocate a buffer, but it is safe to be called from any context (except NMIs). But if a snapshot buffer isn't allocated when it is called, it will write to the live buffer, complaining about the lack of a snapshot buffer, and then stop tracing (giving you the "permanent snapshot"). tracing_snapshot_alloc() will allocate the snapshot buffer if it was not already allocated and then take the snapshot. This routine *may sleep*, and must be called from context that can sleep. The allocation is done with GFP_KERNEL and not atomic. If you need a snapshot in an atomic context, say in early boot, then it is best to call the tracing_snapshot_alloc() before then, where it will allocate the buffer, and then you can use the tracing_snapshot() anywhere you want and still get snapshots. Cc: Hiraku Toyooka <hiraku.toyooka.gu@hitachi.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/kernel.h | 4 +++ kernel/trace/trace.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c566927efcbd..bc5392a326ab 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -483,6 +483,8 @@ enum ftrace_dump_mode { void tracing_on(void); void tracing_off(void); int tracing_is_on(void); +void tracing_snapshot(void); +void tracing_snapshot_alloc(void); extern void tracing_start(void); extern void tracing_stop(void); @@ -570,6 +572,8 @@ static inline void trace_dump_stack(void) { } static inline void tracing_on(void) { } static inline void tracing_off(void) { } static inline int tracing_is_on(void) { return 0; } +static inline void tracing_snapshot(void) { } +static inline void tracing_snapshot_alloc(void) { } static inline __printf(1, 2) int trace_printk(const char *fmt, ...) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a89496dc99b..307524d784ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -339,6 +339,90 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + struct tracer *tracer = tr->current_trace; + unsigned long flags; + + if (!tr->allocated_snapshot) { + trace_printk("*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_printk("*** stopping trace here! ***\n"); + tracing_off(); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer->use_max_tr) { + trace_printk("*** LATENCY TRACER ACTIVE ***\n"); + trace_printk("*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id()); + local_irq_restore(flags); +} + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id); + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + if (WARN_ON(ret < 0)) + return; + + tr->allocated_snapshot = true; + } + + tracing_snapshot(); +} +#else +void tracing_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +void tracing_snapshot_alloc(void) +{ + /* Give warning */ + tracing_snapshot(); +} +#endif /* CONFIG_TRACER_SNAPSHOT */ + /** * tracing_off - turn off tracing buffers * -- cgit v1.2.3 From f5eb5588262cab7232ed1d77cf612b327db50767 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Thu, 7 Mar 2013 09:27:42 -0500 Subject: ring-buffer: Do not use schedule_work_on() for current CPU The ring buffer updates when done while the ring buffer is active, needs to be completed on the CPU that is used for the ring buffer per_cpu buffer. To accomplish this, schedule_work_on() is used to schedule work on the given CPU. Now there's no reason to use schedule_work_on() if the process doing the update happens to be on the CPU that it is processing. It has already filled the requirement. Instead, just do the work and continue. This is needed for tracing_snapshot_alloc() where it may be called really early in boot, where the work queues have not been set up yet. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ring_buffer.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 65fe2a4f9824..d1c85c5f5f51 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1679,11 +1679,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!cpu_buffer->nr_pages_to_update) continue; - if (cpu_online(cpu)) + /* The update must run on the CPU that is being updated. */ + preempt_disable(); + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rb_update_pages(cpu_buffer); + cpu_buffer->nr_pages_to_update = 0; + } else { + /* + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. + */ + preempt_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); - else - rb_update_pages(cpu_buffer); + preempt_disable(); + } + preempt_enable(); } /* wait for all the updates to complete */ @@ -1721,12 +1732,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, get_online_cpus(); - if (cpu_online(cpu_id)) { + preempt_disable(); + /* The update must run on the CPU that is being updated. */ + if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) + rb_update_pages(cpu_buffer); + else { + /* + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. + */ + preempt_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); - } else - rb_update_pages(cpu_buffer); + preempt_disable(); + } + preempt_enable(); cpu_buffer->nr_pages_to_update = 0; put_online_cpus(); -- cgit v1.2.3 From f4e781c0a89d5810729772290441ac7d61f321ec Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Thu, 7 Mar 2013 11:10:56 -0500 Subject: tracing: Move the tracing selftest code into its own function Move the tracing startup selftest code into its own function and when not enabled, always have that function succeed. This makes the register_tracer() function much more readable. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 124 ++++++++++++++++++++++++++++----------------------- 1 file changed, 69 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 307524d784ec..57b4220d96a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -818,6 +818,72 @@ static void default_wait_pipe(struct trace_iterator *iter) ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); } +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ + struct trace_array *tr = &global_trace; + struct tracer *saved_tracer = tr->current_trace; + int ret; + + if (!type->selftest || tracing_selftest_disabled) + return 0; + + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. + */ + tracing_reset_online_cpus(&tr->trace_buffer); + + tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + tr->allocated_snapshot = true; + } +#endif + + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + tr->current_trace = saved_tracer; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); + return -1; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + tr->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, 1, + RING_BUFFER_ALL_CPUS); + } +#endif + + printk(KERN_CONT "PASSED\n"); + return 0; +} +#else +static inline int run_tracer_selftest(struct tracer *type) +{ + return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + /** * register_tracer - register a tracer with the ftrace system. * @type - the plugin for the tracer @@ -863,61 +929,9 @@ int register_tracer(struct tracer *type) if (!type->wait_pipe) type->wait_pipe = default_wait_pipe; - -#ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest && !tracing_selftest_disabled) { - struct trace_array *tr = &global_trace; - struct tracer *saved_tracer = tr->current_trace; - - /* - * Run a selftest on this tracer. - * Here we reset the trace buffer, and set the current - * tracer to be this tracer. The tracer can then run some - * internal tracing to verify that everything is in order. - * If we fail, we do not register this tracer. - */ - tracing_reset_online_cpus(&tr->trace_buffer); - - tr->current_trace = type; - -#ifdef CONFIG_TRACER_MAX_TRACE - if (type->use_max_tr) { - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded) - ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, - RING_BUFFER_ALL_CPUS); - tr->allocated_snapshot = true; - } -#endif - - /* the test is responsible for initializing and enabling */ - pr_info("Testing tracer %s: ", type->name); - ret = type->selftest(type, tr); - /* the test is responsible for resetting too */ - tr->current_trace = saved_tracer; - if (ret) { - printk(KERN_CONT "FAILED!\n"); - /* Add the warning after printing 'FAILED' */ - WARN_ON(1); - goto out; - } - /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(&tr->trace_buffer); - -#ifdef CONFIG_TRACER_MAX_TRACE - if (type->use_max_tr) { - tr->allocated_snapshot = false; - - /* Shrink the max buffer again */ - if (ring_buffer_expanded) - ring_buffer_resize(tr->max_buffer.buffer, 1, - RING_BUFFER_ALL_CPUS); - } -#endif - - printk(KERN_CONT "PASSED\n"); - } -#endif + ret = run_tracer_selftest(type); + if (ret < 0) + goto out; type->next = trace_types; trace_types = type; -- cgit v1.2.3 From 55034cd6e648155393b0d665eef76b38d49ad6bf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <srostedt@redhat.com> Date: Thu, 7 Mar 2013 22:48:09 -0500 Subject: tracing: Add alloc_snapshot kernel command line parameter If debugging the kernel, and the developer wants to use tracing_snapshot() in places where tracing_snapshot_alloc() may be difficult (or more likely, the developer is lazy and doesn't want to bother with tracing_snapshot_alloc() at all), then adding alloc_snapshot to the kernel command line parameter will tell ftrace to allocate the snapshot buffer (if configured) when it allocates the main tracing buffer. I also noticed that ring_buffer_expanded and tracing_selftest_disabled had inconsistent use of boolean "true" and "false" with "0" and "1". I cleaned that up too. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- Documentation/kernel-parameters.txt | 7 ++++ kernel/trace/trace.c | 81 ++++++++++++++++++++++--------------- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 4 +- 4 files changed, 58 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6c723811c0a0..0edc409f9ede 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -320,6 +320,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. on: enable for both 32- and 64-bit processes off: disable for both 32- and 64-bit processes + alloc_snapshot [FTRACE] + Allocate the ftrace snapshot buffer on boot up when the + main buffer is allocated. This is handy if debugging + and you need to use tracing_snapshot() on boot up, and + do not want to use tracing_snapshot_alloc() as it needs + to be done where GFP_KERNEL allocations are allowed. + amd_iommu= [HW,X86-64] Pass parameters to the AMD IOMMU driver in the system. Possible values are: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 57b4220d96a9..4021a5e66412 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -47,7 +47,7 @@ * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -int ring_buffer_expanded; +bool ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -121,12 +121,14 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; +static bool allocate_snapshot; + static int __init set_cmdline_ftrace(char *str) { strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; return 1; } __setup("ftrace=", set_cmdline_ftrace); @@ -147,6 +149,15 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init alloc_snapshot(char *str) +{ + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + return 1; +} +__setup("alloc_snapshot", alloc_snapshot); + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static char *trace_boot_options __initdata; @@ -951,7 +962,7 @@ int register_tracer(struct tracer *type) tracing_set_tracer(type->name); default_bootup_tracer = NULL; /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = 1; + tracing_selftest_disabled = true; #ifdef CONFIG_FTRACE_STARTUP_TEST printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", type->name); @@ -3318,7 +3329,7 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; /* May be called before buffers are initialized */ if (!tr->trace_buffer.buffer) @@ -5396,53 +5407,57 @@ static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) } } -static int allocate_trace_buffers(struct trace_array *tr, int size) +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) { enum ring_buffer_flags rb_flags; rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - tr->trace_buffer.buffer = ring_buffer_alloc(size, rb_flags); - if (!tr->trace_buffer.buffer) - goto out_free; + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; - tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu); - if (!tr->trace_buffer.data) - goto out_free; + buf->data = alloc_percpu(struct trace_array_cpu); + if (!buf->data) { + ring_buffer_free(buf->buffer); + return -ENOMEM; + } - init_trace_buffers(tr, &tr->trace_buffer); + init_trace_buffers(tr, buf); /* Allocate the first page for all buffers */ set_buffer_entries(&tr->trace_buffer, ring_buffer_size(tr->trace_buffer.buffer, 0)); -#ifdef CONFIG_TRACER_MAX_TRACE - - tr->max_buffer.buffer = ring_buffer_alloc(1, rb_flags); - if (!tr->max_buffer.buffer) - goto out_free; - - tr->max_buffer.data = alloc_percpu(struct trace_array_cpu); - if (!tr->max_buffer.data) - goto out_free; + return 0; +} - init_trace_buffers(tr, &tr->max_buffer); +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ + int ret; - set_buffer_entries(&tr->max_buffer, 1); -#endif - return 0; + ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); + if (ret) + return ret; - out_free: - if (tr->trace_buffer.buffer) +#ifdef CONFIG_TRACER_MAX_TRACE + ret = allocate_trace_buffer(tr, &tr->max_buffer, + allocate_snapshot ? size : 1); + if (WARN_ON(ret)) { ring_buffer_free(tr->trace_buffer.buffer); - free_percpu(tr->trace_buffer.data); + free_percpu(tr->trace_buffer.data); + return -ENOMEM; + } + tr->allocated_snapshot = allocate_snapshot; -#ifdef CONFIG_TRACER_MAX_TRACE - if (tr->max_buffer.buffer) - ring_buffer_free(tr->max_buffer.buffer); - free_percpu(tr->max_buffer.data); + /* + * Only the top level trace array gets its snapshot allocated + * from the kernel command line. + */ + allocate_snapshot = false; #endif - return -ENOMEM; + return 0; } static int new_instance_create(const char *name) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f4931821a966..26bc71834041 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -660,7 +660,7 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; DECLARE_PER_CPU(int, ftrace_cpu_disabled); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a376ab5eec5c..38b54c5edeb9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1844,8 +1844,8 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = 1; - tracing_selftest_disabled = 1; + ring_buffer_expanded = true; + tracing_selftest_disabled = true; return 1; } -- cgit v1.2.3 From 153e8ed913b022d2003866a848af9fadc041403f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 8 Mar 2013 10:40:07 -0500 Subject: tracing: Fix the branch tracer that broke with buffer change The changce to add the trace_buffer struct to have the trace array have both the main buffer and max buffer broke the branch tracer because the change did not update that code. As the branch tracer adds a significant amount of overhead, and must be selected via a selection (not a allyesconfig) it was missed in testing. Reported-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_branch.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 6dadbefbb1d6..d594da0dc03c 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -52,12 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (atomic_inc_return(&data->disabled) != 1) goto out; pc = preempt_count(); - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) -- cgit v1.2.3 From 09ae72348eccb60e304cf8ce94653f4a78fcd407 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 8 Mar 2013 21:02:34 -0500 Subject: tracing: Add trace_puts() for even faster trace_printk() tracing The trace_printk() is extremely fast and is very handy as it can be used in any context (including NMIs!). But it still requires scanning the fmt string for parsing the args. Even the trace_bprintk() requires a scan to know what args will be saved, although it doesn't copy the format string itself. Several times trace_printk() has no args, and wastes cpu cycles scanning the fmt string. Adding trace_puts() allows the developer to use an even faster tracing method that only saves the pointer to the string in the ring buffer without doing any format parsing at all. This will help remove even more of the "Heisenbug" effect, when debugging. Also fixed up the F_printk()s for the ftrace internal bprint and print events. Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/kernel.h | 41 +++++++++++++++++++++++- kernel/trace/trace.c | 76 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_entries.h | 23 +++++++++++--- kernel/trace/trace_output.c | 75 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 2 ++ 6 files changed, 214 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bc5392a326ab..a3a5574a61fc 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -514,7 +514,8 @@ do { \ * * This is intended as a debugging tool for the developer only. * Please refrain from leaving trace_printks scattered around in - * your code. + * your code. (Extra memory is used for special buffers that are + * allocated when trace_printk() is used) */ #define trace_printk(fmt, args...) \ @@ -537,6 +538,44 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...); extern __printf(2, 3) int __trace_printk(unsigned long ip, const char *fmt, ...); +/** + * trace_puts - write a string into the ftrace buffer + * @str: the string to record + * + * Note: __trace_bputs is an internal function for trace_puts and + * the @ip is passed in via the trace_puts macro. + * + * This is similar to trace_printk() but is made for those really fast + * paths that a developer wants the least amount of "Heisenbug" affects, + * where the processing of the print format is still too much. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_puts scattered around in + * your code. (Extra memory is used for special buffers that are + * allocated when trace_puts() is used) + * + * Returns: 0 if nothing was written, positive # if string was. + * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) + */ + +extern int __trace_bputs(unsigned long ip, const char *str); +extern int __trace_puts(unsigned long ip, const char *str, int size); +#define trace_puts(str) ({ \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(str) ? str : NULL; \ + \ + if (__builtin_constant_p(str)) \ + __trace_bputs(_THIS_IP_, trace_printk_fmt); \ + else \ + __trace_puts(_THIS_IP_, str, strlen(str)); \ +}) + extern void trace_dump_stack(void); /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4021a5e66412..5043a0c4dde0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -350,6 +350,77 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + int alloc; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + + return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct bputs_entry *entry; + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + #ifdef CONFIG_TRACER_SNAPSHOT /** * trace_snapshot - take a snapshot of the current buffer. @@ -2475,6 +2546,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPUTS && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bputs_msg_only(iter); + if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 26bc71834041..d5764a8532e2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,6 +34,7 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BLK, + TRACE_BPUTS, __TRACE_LAST_TYPE, }; @@ -277,6 +278,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca2..e2d027ac66a2 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->fmt), FILTER_OTHER ); @@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%08lx %s", - __entry->ip, __entry->buf), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->buf), + + FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + + TRACE_BPUTS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, str ) + ), + + F_printk("%pf: %s", + (void *)__entry->ip, __entry->str), FILTER_OTHER ); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 2edc7220d017..19f48e7edc39 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) return ret; } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bputs_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_puts(s, field->str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1244,6 +1260,64 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bputs_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { + .trace = trace_bputs_print, + .raw = trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { + .type = TRACE_BPUTS, + .funcs = &trace_bputs_funcs, +}; + /* TRACE_BPRINT */ static enum print_line_t trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1356,6 +1430,7 @@ static struct trace_event *events[] __initdata = { &trace_wake_event, &trace_stack_event, &trace_user_stack_event, + &trace_bputs_event, &trace_bprint_event, &trace_print_event, NULL diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492b..af77870de278 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -4,6 +4,8 @@ #include <linux/trace_seq.h> #include "trace.h" +extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t -- cgit v1.2.3 From ca268da6e415448a43138e1abc5d5f057af319d7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Sat, 9 Mar 2013 00:40:58 -0500 Subject: tracing: Add internal ftrace trace_puts() for ftrace to use There's a few places that ftrace uses trace_printk() for internal use, but this requires context (normal, softirq, irq, NMI) buffers to keep things lockless. But the trace_puts() does not, as it can write the string directly into the ring buffer. Make a internal helper for trace_puts() and have the internal functions use that. This way the extra context buffers are not used. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 8 ++++---- kernel/trace/trace.h | 11 +++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5043a0c4dde0..d372c6504c99 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -443,16 +443,16 @@ void tracing_snapshot(void) unsigned long flags; if (!tr->allocated_snapshot) { - trace_printk("*** SNAPSHOT NOT ALLOCATED ***\n"); - trace_printk("*** stopping trace here! ***\n"); + internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); + internal_trace_puts("*** stopping trace here! ***\n"); tracing_off(); return; } /* Note, snapshot can not be used when the tracer uses it */ if (tracer->use_max_tr) { - trace_printk("*** LATENCY TRACER ACTIVE ***\n"); - trace_printk("*** Can not use snapshot (sorry) ***\n"); + internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); + internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); return; } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d5764a8532e2..0e430b401ab6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1040,6 +1040,17 @@ void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. + */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ -- cgit v1.2.3 From 1b22e382ab40b0e3ee5abb3e310dffb16fee22aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Sat, 9 Mar 2013 00:56:08 -0500 Subject: tracing: Let tracing_snapshot() be used by modules but not NMI Add EXPORT_SYMBOL_GPL() to let the tracing_snapshot() functions be called from modules. Also add a test to see if the snapshot was called from NMI context and just warn in the tracing buffer if so, and return. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d372c6504c99..5c53e4092269 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -442,6 +442,12 @@ void tracing_snapshot(void) struct tracer *tracer = tr->current_trace; unsigned long flags; + if (in_nmi()) { + internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + internal_trace_puts("*** snapshot is being ignored ***\n"); + return; + } + if (!tr->allocated_snapshot) { internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); internal_trace_puts("*** stopping trace here! ***\n"); @@ -460,6 +466,7 @@ void tracing_snapshot(void) update_max_tr(tr, current, smp_processor_id()); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(tracing_snapshot); static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); @@ -493,16 +500,19 @@ void tracing_snapshot_alloc(void) tracing_snapshot(); } +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #else void tracing_snapshot(void) { WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); } +EXPORT_SYMBOL_GPL(tracing_snapshot); void tracing_snapshot_alloc(void) { /* Give warning */ tracing_snapshot(); } +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ /** -- cgit v1.2.3 From 1c31714328be90764e46716f31fb0bd6da44c305 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Sat, 9 Mar 2013 08:36:53 -0500 Subject: tracing: Consolidate updating of count for traceon/off Remove some duplicate code and replace it with a helper function. This makes the code a it cleaner. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_functions.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index e467c0c7bdd5..38cfb290ecd9 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -214,38 +214,37 @@ static struct tracer function_trace __read_mostly = }; #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data) { - long *count = (long *)data; - - if (tracing_is_on()) - return; + unsigned long *count = (long *)data; if (!*count) - return; + return 0; if (*count != -1) (*count)--; - tracing_on(); + return 1; } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) { - long *count = (long *)data; - - if (!tracing_is_on()) + if (tracing_is_on()) return; - if (!*count) - return; + if (update_count(data)) + tracing_on(); +} - if (*count != -1) - (*count)--; +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; - tracing_off(); + if (update_count(data)) + tracing_off(); } static int -- cgit v1.2.3 From 8b8fa62c60e03a53c46324075a8dc25821741daa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 09:25:00 -0400 Subject: tracing: Consolidate ftrace_trace_onoff_unreg() into callback The only thing ftrace_trace_onoff_unreg() does is to do a strcmp() against the cmd parameter to determine what op to unregister. But this compare is also done after the location that this function is called (and returns). By moving the check for '!' to unregister after the strcmp(), the callback function itself can just do the unregister and we can get rid of the helper function. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_functions.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 38cfb290ecd9..a88a3e0b0cc2 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -282,22 +282,6 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, return 0; } -static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) -{ - struct ftrace_probe_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; - - unregister_ftrace_function_probe_func(glob, ops); - - return 0; -} - static int ftrace_trace_onoff_callback(struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) @@ -311,15 +295,17 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, if (!enable) return -EINVAL; - if (glob[0] == '!') - return ftrace_trace_onoff_unreg(glob+1, cmd, param); - /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) ops = &traceon_probe_ops; else ops = &traceoff_probe_ops; + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } + if (!param) goto out_reg; -- cgit v1.2.3 From 8380d24860e9d1659ab22896b86d7fe591c424fa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Sat, 9 Mar 2013 08:56:43 -0500 Subject: ftrace: Separate unlimited probes from count limited probes The function tracing probes that trigger traceon or traceoff can be set to unlimited, or given a count of # of times to execute. By separating these two types of probes, we can then use the dynamic ftrace function filtering directly, and remove the brute force "check if this function called is my probe" routines in ftrace. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_functions.c | 38 +++++++++++++++++++++++++++++++++----- 1 file changed, 33 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a88a3e0b0cc2..043b2425ae73 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -228,7 +228,7 @@ static int update_count(void **data) } static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) { if (tracing_is_on()) return; @@ -238,7 +238,7 @@ ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) { if (!tracing_is_on()) return; @@ -247,10 +247,38 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) tracing_off(); } +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (tracing_is_on()) + return; + + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; + + tracing_off(); +} + static int ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data); +static struct ftrace_probe_ops traceon_count_probe_ops = { + .func = ftrace_traceon_count, + .print = ftrace_trace_onoff_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { + .func = ftrace_traceoff_count, + .print = ftrace_trace_onoff_print, +}; + static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, .print = ftrace_trace_onoff_print, @@ -269,7 +297,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, seq_printf(m, "%ps:", (void *)ip); - if (ops == &traceon_probe_ops) + if (ops == &traceon_probe_ops || ops == &traceon_count_probe_ops) seq_printf(m, "traceon"); else seq_printf(m, "traceoff"); @@ -297,9 +325,9 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, /* we register both traceon and traceoff to this callback */ if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; + ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; else - ops = &traceoff_probe_ops; + ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; if (glob[0] == '!') { unregister_ftrace_function_probe_func(glob+1, ops); -- cgit v1.2.3 From e1df4cb682ab2c3c2981c8efa4aec044e61f4e06 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 10:09:42 -0400 Subject: ftrace: Fix function probe to only enable needed functions Currently the function probe enables all functions and runs a "hash" against every function call to see if it should call a probe. This is extremely wasteful. Note, a probe is something like: echo schedule:traceoff > /debug/tracing/set_ftrace_filter When schedule is called, the probe will disable tracing. But currently, it has a call back for *all* functions, and checks to see if the called function is the probe that is needed. The probe function has been created before ftrace was rewritten to allow for more than one "op" to be registered by the function tracer. When probes were created, it couldn't limit the functions without also limiting normal function calls. But now we can, it's about time to update the probe code. Todo, have separate ops for different entries. That is, assign a ftrace_ops per probe, instead of one op for all probes. But as there's not many probes assigned, this may not be that urgent. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e6effd0c40a9..dab031fec85b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2988,18 +2988,20 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp) kfree(entry); } - int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; unsigned long key; int count = 0; char *search; + int ret; type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); @@ -3010,8 +3012,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) { + count = -ENOMEM; + goto out_unlock; + } + + if (unlikely(ftrace_disabled)) { + count = -ENODEV; goto out_unlock; + } do_for_each_ftrace_rec(pg, rec) { @@ -3043,6 +3053,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } } + ret = enter_record(hash, rec, 0); + if (ret < 0) { + kfree(entry); + count = ret; + goto out_unlock; + } + entry->ops = ops; entry->ip = rec->ip; @@ -3050,10 +3067,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); + + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + if (ret < 0) + count = ret; + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); + free_ftrace_hash(hash); return count; } @@ -3067,7 +3090,10 @@ static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { + struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash *hash; struct hlist_node *n, *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; @@ -3088,6 +3114,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } mutex_lock(&ftrace_lock); + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + /* Hmm, should report this somehow */ + goto out_unlock; + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -3108,12 +3140,24 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } + rec_entry = ftrace_lookup_ip(hash, entry->ip); + /* It is possible more than one entry had this ip */ + if (rec_entry) + free_hash_entry(hash, rec_entry); + hlist_del_rcu(&entry->node); call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); } } __disable_ftrace_function_probe(); + /* + * Remove after the disable is called. Otherwise, if the last + * probe is removed, a null hash means *all enabled*. + */ + ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + out_unlock: mutex_unlock(&ftrace_lock); + free_ftrace_hash(hash); } void -- cgit v1.2.3 From 3209cff4490bee55fd2bc1d087cb8ecf2a686a88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 11:17:54 -0400 Subject: tracing: Add alloc/free_snapshot() to replace duplicate code Add alloc_snapshot() and free_snapshot() to allocate and free the snapshot buffer respectively, and use these to remove duplicate code. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 79 ++++++++++++++++++++++++++++------------------------ 1 file changed, 42 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c53e4092269..906049c0af90 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -149,14 +149,14 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); -static int __init alloc_snapshot(char *str) +static int __init boot_alloc_snapshot(char *str) { allocate_snapshot = true; /* We also need the main ring buffer expanded */ ring_buffer_expanded = true; return 1; } -__setup("alloc_snapshot", alloc_snapshot); +__setup("alloc_snapshot", boot_alloc_snapshot); static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; @@ -470,6 +470,38 @@ EXPORT_SYMBOL_GPL(tracing_snapshot); static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); + tr->allocated_snapshot = false; +} /** * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. @@ -487,16 +519,9 @@ void tracing_snapshot_alloc(void) struct trace_array *tr = &global_trace; int ret; - if (!tr->allocated_snapshot) { - - /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&tr->max_buffer, - &tr->trace_buffer, RING_BUFFER_ALL_CPUS); - if (WARN_ON(ret < 0)) - return; - - tr->allocated_snapshot = true; - } + ret = alloc_snapshot(tr); + if (WARN_ON(ret < 0)) + return; tracing_snapshot(); } @@ -3581,15 +3606,7 @@ static int tracing_set_tracer(const char *buf) * so a synchronized_sched() is sufficient. */ synchronize_sched(); - /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. - */ - ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&tr->max_buffer, 1); - tracing_reset_online_cpus(&tr->max_buffer); - tr->allocated_snapshot = false; + free_snapshot(tr); } #endif destroy_trace_option_files(topts); @@ -3598,12 +3615,9 @@ static int tracing_set_tracer(const char *buf) #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { - /* we need to make per cpu buffer sizes equivalent */ - ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->trace_buffer, - RING_BUFFER_ALL_CPUS); + ret = alloc_snapshot(tr); if (ret < 0) goto out; - tr->allocated_snapshot = true; } #endif @@ -4475,14 +4489,8 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, ret = -EINVAL; break; } - if (tr->allocated_snapshot) { - /* free spare buffer */ - ring_buffer_resize(tr->max_buffer.buffer, 1, - RING_BUFFER_ALL_CPUS); - set_buffer_entries(&tr->max_buffer, 1); - tracing_reset_online_cpus(&tr->max_buffer); - tr->allocated_snapshot = false; - } + if (tr->allocated_snapshot) + free_snapshot(tr); break; case 1: /* Only allow per-cpu swap if the ring buffer supports it */ @@ -4493,12 +4501,9 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, } #endif if (!tr->allocated_snapshot) { - /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&tr->max_buffer, - &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + ret = alloc_snapshot(tr); if (ret < 0) break; - tr->allocated_snapshot = true; } local_irq_disable(); /* Now, we're going to swap */ -- cgit v1.2.3 From 77fd5c15e3216b901be69047ca43b05ae9099951 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 11:49:18 -0400 Subject: tracing: Add snapshot trigger to function probes echo 'schedule:snapshot:1' > /debug/tracing/set_ftrace_filter This will cause the scheduler to trigger a snapshot the next time it's called (you can use any function that's not called by NMI). Even though it triggers only once, you still need to remove it with: echo '!schedule:snapshot:0' > /debug/tracing/set_ftrace_filter The :1 can be left off for the first command: echo 'schedule:snapshot' > /debug/tracing/set_ftrace_filter But this will cause all calls to schedule to trigger a snapshot. This must be removed without the ':0' echo '!schedule:snapshot' > /debug/tracing/set_ftrace_filter As adding a "count" is a different operation (internally). Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 906049c0af90..c5b844621562 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5086,7 +5086,114 @@ static const struct file_operations tracing_dyn_info_fops = { .read = tracing_read_dyn_info, .llseek = generic_file_llseek, }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ + +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + tracing_snapshot(); +} + +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "snapshot"); + + if (count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, ops, count); + + if (ret >= 0) + alloc_snapshot(&global_trace); + + return ret < 0 ? ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +static int register_snapshot_cmd(void) +{ + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) { @@ -6076,6 +6183,8 @@ __init static int tracer_alloc_buffers(void) trace_set_options(&global_trace, option); } + register_snapshot_cmd(); + return 0; out_free_cpumask: -- cgit v1.2.3 From e67efb93f0e9130174293ffaa5975f87b301b531 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 15:07:59 -0400 Subject: ftrace: Clean up function probe methods When a function probe is created, each function that the probe is attached to, a "callback" method is called. On release of the probe, each function entry calls the "free" method. First, "callback" is a confusing name and does not really match what it does. Callback sounds like it will be called when the probe triggers. But that's not the case. This is really an "init" function, so lets rename it as such. Secondly, both "init" and "free" do not pass enough information back to the handlers. Pass back the ops, ip and data for each time the method is called. We have the information, might as well use it. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace.h | 6 ++++-- kernel/trace/ftrace.c | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e5ca8ef50e9b..832422d706f4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -259,8 +259,10 @@ struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); - int (*callback)(unsigned long ip, void **data); - void (*free)(void **data); + int (*init)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + void (*free)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); int (*print)(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dab031fec85b..ff0ef41c6d93 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2984,7 +2984,7 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp) container_of(rhp, struct ftrace_func_probe, rcu); if (entry->ops->free) - entry->ops->free(&entry->data); + entry->ops->free(entry->ops, entry->ip, &entry->data); kfree(entry); } @@ -3045,8 +3045,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * for each function we find. We call the callback * to give the caller an opportunity to do so. */ - if (ops->callback) { - if (ops->callback(rec->ip, &entry->data) < 0) { + if (ops->init) { + if (ops->init(ops, rec->ip, &entry->data) < 0) { /* caller does not like this func */ kfree(entry); continue; -- cgit v1.2.3 From 7818b3886545f89549185e4023743e2df91d1fa1 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 13 Mar 2013 12:42:58 -0400 Subject: ftrace: Use manual free after synchronize_sched() not call_rcu_sched() The entries to the probe hash must be freed after a synchronize_sched() after the entry has been removed from the hash. As the entries are registered with ops that may have their own callbacks, and these callbacks may sleep, we can not use call_rcu_sched() because the rcu callbacks registered with that are called from a softirq context. Instead of using call_rcu_sched(), manually save the entries on a free_list and at the end of the loop that removes the entries, do a synchronize_sched() and then go through the free_list, freeing the entries. Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ff0ef41c6d93..25770824598f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1068,7 +1068,7 @@ struct ftrace_func_probe { unsigned long flags; unsigned long ip; void *data; - struct rcu_head rcu; + struct list_head free_list; }; struct ftrace_func_entry { @@ -2978,11 +2978,8 @@ static void __disable_ftrace_function_probe(void) } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry) { - struct ftrace_func_probe *entry = - container_of(rhp, struct ftrace_func_probe, rcu); - if (entry->ops->free) entry->ops->free(entry->ops, entry->ip, &entry->data); kfree(entry); @@ -3092,7 +3089,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, { struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; + struct ftrace_func_probe *p; struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct list_head free_list; struct ftrace_hash *hash; struct hlist_node *n, *tmp; char str[KSYM_SYMBOL_LEN]; @@ -3120,6 +3119,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, /* Hmm, should report this somehow */ goto out_unlock; + INIT_LIST_HEAD(&free_list); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -3146,7 +3147,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, free_hash_entry(hash, rec_entry); hlist_del_rcu(&entry->node); - call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu); + list_add(&entry->free_list, &free_list); } } __disable_ftrace_function_probe(); @@ -3155,6 +3156,12 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * probe is removed, a null hash means *all enabled*. */ ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + synchronize_sched(); + list_for_each_entry_safe(entry, p, &free_list, free_list) { + list_del(&entry->free_list); + ftrace_free_entry(entry); + } + out_unlock: mutex_unlock(&ftrace_lock); free_ftrace_hash(hash); -- cgit v1.2.3 From 417944c4c7a0f657158d0515f3b8e8c043fd788f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 13:26:18 -0400 Subject: tracing: Add a way to soft disable trace events In order to let triggers enable or disable events, we need a 'soft' method for doing so. For example, if a function probe is added that lets a user enable or disable events when a function is called, that change must be done without taking locks or a mutex, and definitely it can't sleep. But the full enabling of a tracepoint is expensive. By adding a 'SOFT_DISABLE' flag, and converting the flags to be updated without the protection of a mutex (using set/clear_bit()), this soft disable flag can be used to allow critical sections to enable or disable events from being traced (after the event has been placed into "SOFT_MODE"). Some caveats though: The comm recorder (to map pids with a comm) can not be soft disabled (yet). If you disable an event with with a "soft" disable and wait a while before reading the trace, the comm cache may be replaced and you'll get a bunch of <...> for comms in the trace. Reading the "enable" file for an event that is disabled will now give you "0*" where the '*' denotes that the tracepoint is still active but the event itself is "disabled". [ fixed _BIT used in & operation : thanks to Dan Carpenter and smatch ] Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace_event.h | 20 ++++++++---- include/trace/ftrace.h | 8 +++++ kernel/trace/trace_events.c | 75 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 84 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4cb6cd8338a4..4e28b011e63b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -251,16 +251,23 @@ struct ftrace_subsystem_dir; enum { FTRACE_EVENT_FL_ENABLED_BIT, FTRACE_EVENT_FL_RECORDED_CMD_BIT, + FTRACE_EVENT_FL_SOFT_MODE_BIT, + FTRACE_EVENT_FL_SOFT_DISABLED_BIT, }; /* * Ftrace event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch + * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED + * SOFT_DISABLED - When set, do not trace the event (even though its + * tracepoint may be enabled) */ enum { FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), + FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), + FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), }; struct ftrace_event_file { @@ -274,17 +281,18 @@ struct ftrace_event_file { * 32 bit flags: * bit 0: enabled * bit 1: enabled cmd record + * bit 2: enable/disable with the soft disable bit + * bit 3: soft disabled * - * Changes to flags must hold the event_mutex. - * - * Note: Reads of flags do not hold the event_mutex since - * they occur in critical sections. But the way flags + * Note: The bits must be set atomically to prevent races + * from other writers. Reads of flags do not need to be in + * sync as they occur in critical sections. But the way flags * is currently used, these changes do not affect the code * except that when a change is made, it may have a slight * delay in propagating the changes to other CPUs due to - * caching and such. + * caching and such. Which is mostly OK ;-) */ - unsigned int flags; + unsigned long flags; }; #define __TRACE_EVENT_FLAGS(name, value) \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index bbf09c2021b9..4bda044e6c77 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -413,6 +413,10 @@ static inline notrace int ftrace_get_offsets_##call( \ * int __data_size; * int pc; * + * if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, + * &ftrace_file->flags)) + * return; + * * local_save_flags(irq_flags); * pc = preempt_count(); * @@ -518,6 +522,10 @@ ftrace_raw_event_##call(void *__data, proto) \ int __data_size; \ int pc; \ \ + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, \ + &ftrace_file->flags)) \ + return; \ + \ local_save_flags(irq_flags); \ pc = preempt_count(); \ \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 38b54c5edeb9..106640b0df4a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -205,37 +205,77 @@ void trace_event_enable_cmd_record(bool enable) if (enable) { tracing_start_cmdline_record(); - file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); - file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int ftrace_event_enable_disable(struct ftrace_event_file *file, - int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) { struct ftrace_event_call *call = file->event_call; int ret = 0; + int disable; switch (enable) { case 0: - if (file->flags & FTRACE_EVENT_FL_ENABLED) { - file->flags &= ~FTRACE_EVENT_FL_ENABLED; + /* + * When soft_disable is set and enable is cleared, we want + * to clear the SOFT_DISABLED flag but leave the event in the + * state that it was. That is, if the event was enabled and + * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED + * is set we do not want the event to be enabled before we + * clear the bit. + * + * When soft_disable is not set but the SOFT_MODE flag is, + * we do nothing. Do not disable the tracepoint, otherwise + * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + */ + if (soft_disable) { + disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; + clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } else + disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + + if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { + clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); } + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ + if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: + /* + * When soft_disable is set and enable is set, we want to + * register the tracepoint for the event, but leave the event + * as is. That means, if the event was already enabled, we do + * nothing (but set SOFT_MODE). If the event is disabled, we + * set SOFT_DISABLED before enabling the event tracepoint, so + * it still seems to be disabled. + */ + if (!soft_disable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + + /* Keep the event disabled, when going to SOFT_MODE. */ + if (soft_disable) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { @@ -244,7 +284,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file, "%s\n", call->name); break; } - file->flags |= FTRACE_EVENT_FL_ENABLED; + set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ call->flags |= TRACE_EVENT_FL_WAS_ENABLED; @@ -255,6 +295,12 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file, return ret; } +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) +{ + return __ftrace_event_enable_disable(file, enable, 0); +} + static void ftrace_clear_events(struct trace_array *tr) { struct ftrace_event_file *file; @@ -547,12 +593,15 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_file *file = filp->private_data; char *buf; - if (file->flags & FTRACE_EVENT_FL_ENABLED) - buf = "1\n"; - else + if (file->flags & FTRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) + buf = "0*\n"; + else + buf = "1\n"; + } else buf = "0\n"; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t -- cgit v1.2.3 From 3cd715de261182413b3487abfffe1b6af41b81b3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 12 Mar 2013 19:35:13 -0400 Subject: tracing: Add function probe triggers to enable/disable events Add triggers to function tracer that lets an event get enabled or disabled when a function is called: format is: <function>:enable_event:<system>:<event>[:<count>] <function>:disable_event:<system>:<event>[:<count>] echo 'schedule:enable_event:sched:sched_switch' > /debug/tracing/set_ftrace_filter Every time schedule is called, it will enable the sched_switch event. echo 'schedule:disable_event:sched:sched_switch:2' > /debug/tracing/set_ftrace_filter The first two times schedule is called while the sched_switch event is enabled, it will disable it. It will not count for a time that the event is already disabled (or enabled for enable_event). [ fixed return without mutex_unlock() - thanks to Dan Carpenter and smatch ] Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: Tom Zanussi <tom.zanussi@linux.intel.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_events.c | 279 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 106640b0df4a..c636523b1a59 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1798,6 +1798,283 @@ __trace_add_event_dirs(struct trace_array *tr) } } +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + +static struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct ftrace_event_file *file; + struct ftrace_event_call *call; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + + if (!call->name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + + if (strcmp(event, call->name) == 0 && + strcmp(system, call->class->system) == 0) + return file; + } + return NULL; +} + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *_data) +{ + struct event_probe_data *data = _data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "%s:%s:%s", + data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + data->file->event_call->class->system, + data->file->event_call->name); + + if (data->count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", data->count); + + return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + data->ref++; + return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(data->file, 0, 1); + module_put(data->file->event_call->mod); + kfree(data); + } + *pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enabled) +{ + struct trace_array *tr = top_trace_array(); + struct ftrace_event_file *file; + struct ftrace_probe_ops *ops; + struct event_probe_data *data; + const char *system; + const char *event; + char *number; + bool enable; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enabled) + return -EINVAL; + + if (!param) + return -EINVAL; + + system = strsep(¶m, ":"); + if (!param) + return -EINVAL; + + event = strsep(¶m, ":"); + + mutex_lock(&event_mutex); + + ret = -EINVAL; + file = find_event_file(tr, system, event); + if (!file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; + else + ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + ret = 0; + goto out; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + data->enable = enable; + data->count = -1; + data->file = file; + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &data->count); + if (ret) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(file->event_call->mod); + if (!ret) + goto out_free; + + ret = __ftrace_event_enable_disable(file, 1, 1); + if (ret < 0) + goto out_put; + ret = register_ftrace_function_probe(glob, ops, data); + if (!ret) + goto out_disable; + out: + mutex_unlock(&event_mutex); + return ret; + + out_disable: + __ftrace_event_enable_disable(file, 0, 1); + out_put: + module_put(file->event_call->mod); + out_free: + kfree(data); + goto out; +} + +static struct ftrace_func_command event_enable_cmd = { + .name = ENABLE_EVENT_STR, + .func = event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { + .name = DISABLE_EVENT_STR, + .func = event_enable_func, +}; + +static __init int register_event_cmds(void) +{ + int ret; + + ret = register_ftrace_command(&event_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_ftrace_command(&event_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_ftrace_command(&event_enable_cmd); + return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + /* * The top level array has already had its ftrace_event_file * descriptors created in order to allow for early events to @@ -2058,6 +2335,8 @@ static __init int event_trace_enable(void) trace_printk_start_comm(); + register_event_cmds(); + return 0; } -- cgit v1.2.3 From c142be8ebe0b7bf73c8a0063925623f3e4b980c0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 13 Mar 2013 09:55:57 -0400 Subject: tracing: Add skip argument to trace_dump_stack() Altough the trace_dump_stack() already skips three functions in the call to stack trace, which gets the stack trace to start at the caller of the function, the caller may want to skip some more too (as it may have helper functions). Add a skip argument to the trace_dump_stack() that lets the caller skip back tracing functions that it doesn't care about. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/kernel.h | 2 +- kernel/trace/trace.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0a16fe03fef..239dbb9627ca 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -597,7 +597,7 @@ extern int __trace_puts(unsigned long ip, const char *str, int size); __trace_puts(_THIS_IP_, str, strlen(str)); \ }) -extern void trace_dump_stack(void); +extern void trace_dump_stack(int skip); /* * The double __builtin_constant_p is because gcc will give us an error diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c5b844621562..8aa53213201f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1657,8 +1657,9 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, /** * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers) */ -void trace_dump_stack(void) +void trace_dump_stack(int skip) { unsigned long flags; @@ -1667,9 +1668,13 @@ void trace_dump_stack(void) local_save_flags(flags); - /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, 3, - preempt_count(), NULL); + /* + * Skip 3 more, seems to get us at the caller of + * this function. + */ + skip += 3; + __ftrace_trace_stack(global_trace.trace_buffer.buffer, + flags, skip, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); -- cgit v1.2.3 From dd42cd3ea96d687f15525c4f14fa582702db223f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 13 Mar 2013 10:17:50 -0400 Subject: tracing: Add function probe to trigger stack traces Add a function probe that will cause a stack trace to be traced in the ring buffer when the given function(s) are called. format is: <function>:stacktrace[:<count>] echo 'schedule:stacktrace' > /debug/tracing/set_ftrace_filter cat /debug/tracing/trace_pipe kworker/2:0-4329 [002] ...2 2933.558007: <stack trace> => kthread => ret_from_fork <idle>-0 [000] .N.2 2933.558019: <stack trace> => rest_init => start_kernel => x86_64_start_reservations => x86_64_start_kernel kworker/2:0-4329 [002] ...2 2933.558109: <stack trace> => kthread => ret_from_fork [...] This can be set to only trace a specific amount of times: echo 'schedule:stacktrace:3' > /debug/tracing/set_ftrace_filter cat /debug/tracing/trace_pipe <...>-58 [003] ...2 841.801694: <stack trace> => kthread => ret_from_fork <idle>-0 [001] .N.2 841.801697: <stack trace> => start_secondary <...>-2059 [001] ...2 841.801736: <stack trace> => wait_for_common => wait_for_completion => flush_work => tty_flush_to_ldisc => input_available_p => n_tty_poll => tty_poll => do_select => core_sys_select => sys_select => system_call_fastpath To remove these: echo '!schedule:stacktrace' > /debug/tracing/set_ftrace_filter echo '!schedule:stacktrace:0' > /debug/tracing/set_ftrace_filter Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_functions.c | 150 +++++++++++++++++++++++++++++++---------- 1 file changed, 115 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 043b2425ae73..c4d6d7191988 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -265,56 +265,103 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) tracing_off(); } +/* + * Skip 4: + * ftrace_stacktrace() + * function_trace_probe_call() + * ftrace_ops_list_func() + * ftrace_call() + */ +#define STACK_SKIP 4 + +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ + trace_dump_stack(STACK_SKIP); +} + +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; + + if (update_count(data)) + trace_dump_stack(STACK_SKIP); +} + static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data); +ftrace_probe_print(const char *name, struct seq_file *m, + unsigned long ip, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:%s", (void *)ip, name); + + if (count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static int +ftrace_traceon_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceon", m, ip, data); +} + +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceoff", m, ip, data); +} + +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("stacktrace", m, ip, data); +} static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, - .print = ftrace_trace_onoff_print, + .print = ftrace_traceon_print, }; static struct ftrace_probe_ops traceoff_count_probe_ops = { .func = ftrace_traceoff_count, - .print = ftrace_trace_onoff_print, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { + .func = ftrace_stacktrace_count, + .print = ftrace_stacktrace_print, }; static struct ftrace_probe_ops traceon_probe_ops = { .func = ftrace_traceon, - .print = ftrace_trace_onoff_print, + .print = ftrace_traceon_print, }; static struct ftrace_probe_ops traceoff_probe_ops = { .func = ftrace_traceoff, - .print = ftrace_trace_onoff_print, + .print = ftrace_traceoff_print, }; -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) -{ - long count = (long)data; - - seq_printf(m, "%ps:", (void *)ip); - - if (ops == &traceon_probe_ops || ops == &traceon_count_probe_ops) - seq_printf(m, "traceon"); - else - seq_printf(m, "traceoff"); - - if (count == -1) - seq_printf(m, ":unlimited\n"); - else - seq_printf(m, ":count=%ld\n", count); - - return 0; -} +static struct ftrace_probe_ops stacktrace_probe_ops = { + .func = ftrace_stacktrace, + .print = ftrace_stacktrace_print, +}; static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, + struct ftrace_hash *hash, char *glob, + char *cmd, char *param, int enable) { - struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -323,12 +370,6 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, if (!enable) return -EINVAL; - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; - else - ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; - if (glob[0] == '!') { unregister_ftrace_function_probe_func(glob+1, ops); return 0; @@ -356,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, return ret < 0 ? ret : 0; } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; + else + ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -366,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = { .func = ftrace_trace_onoff_callback, }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { + .name = "stacktrace", + .func = ftrace_stacktrace_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -377,6 +451,12 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) unregister_ftrace_command(&ftrace_traceoff_cmd); + + ret = register_ftrace_command(&ftrace_stacktrace_cmd); + if (ret) { + unregister_ftrace_command(&ftrace_traceoff_cmd); + unregister_ftrace_command(&ftrace_traceon_cmd); + } return ret; } #else -- cgit v1.2.3 From 87889501d0adfae10e3b0f0e6f2d7536eed9ae84 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 13 Mar 2013 20:43:57 -0400 Subject: tracing: Use stack of calling function for stack tracer Use the stack of stack_trace_call() instead of check_stack() as the test pointer for max stack size. It makes it a bit cleaner and a little more accurate. Adding stable, as a later fix depends on this patch. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_stack.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc701..dc02e29d8255 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -39,20 +39,21 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void +check_stack(unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; int i; - this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; if (this_size <= max_stack_size) return; /* we do not handle interrupt stacks yet */ - if (!object_is_on_stack(&this_size)) + if (!object_is_on_stack(stack)) return; local_irq_save(flags); @@ -73,7 +74,7 @@ static inline void check_stack(void) * Now find where in the stack these are. */ i = 0; - start = &this_size; + start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -113,6 +114,7 @@ static void stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { + unsigned long stack; int cpu; preempt_disable_notrace(); @@ -122,7 +124,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - check_stack(); + check_stack(&stack); out: per_cpu(trace_active, cpu)--; -- cgit v1.2.3 From d4ecbfc49b4b1d4b597fb5ba9e4fa25d62f105c5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 13 Mar 2013 21:25:35 -0400 Subject: tracing: Fix stack tracer with fentry use When gcc 4.6 on x86 is used, the function tracer will use the new option -mfentry which does a call to "fentry" at every function instead of "mcount". The significance of this is that fentry is called as the first operation of the function instead of the mcount usage of being called after the stack. This causes the stack tracer to show some bogus results for the size of the last function traced, as well as showing "ftrace_call" instead of the function. This is due to the stack frame not being set up by the function that is about to be traced. # cat stack_trace Depth Size Location (48 entries) ----- ---- -------- 0) 4824 216 ftrace_call+0x5/0x2f 1) 4608 112 ____cache_alloc+0xb7/0x22d 2) 4496 80 kmem_cache_alloc+0x63/0x12f The 216 size for ftrace_call includes both the ftrace_call stack (which includes the saving of registers it does), as well as the stack size of the parent. To fix this, if CC_USING_FENTRY is defined, then the stack_tracer will reserve the first item in stack_dump_trace[] array when calling save_stack_trace(), and it will fill it in with the parent ip. Then the code will look for the parent pointer on the stack and give the real size of the parent's stack pointer: # cat stack_trace Depth Size Location (14 entries) ----- ---- -------- 0) 2640 48 update_group_power+0x26/0x187 1) 2592 224 update_sd_lb_stats+0x2a5/0x4ac 2) 2368 160 find_busiest_group+0x31/0x1f1 3) 2208 256 load_balance+0xd9/0x662 I'm Cc'ing stable, although it's not urgent, as it only shows bogus size for item #0, the rest of the trace is legit. It should still be corrected in previous stable releases. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_stack.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index dc02e29d8255..ea28e4b0ed58 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -20,13 +20,27 @@ #define STACK_TRACE_ENTRIES 500 +/* + * If fentry is used, then the function being traced will + * jump to fentry directly before it sets up its stack frame. + * We need to ignore that one and record the parent. Since + * the stack frame for the traced function wasn't set up yet, + * the stack_trace wont see the parent. That needs to be added + * manually to stack_dump_trace[] as the first element. + */ +#ifdef CC_USING_FENTRY +# define add_func 1 +#else +# define add_func 0 +#endif + static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; static struct stack_trace max_stack_trace = { - .max_entries = STACK_TRACE_ENTRIES, - .entries = stack_dump_trace, + .max_entries = STACK_TRACE_ENTRIES - add_func, + .entries = &stack_dump_trace[add_func], }; static unsigned long max_stack_size; @@ -40,7 +54,7 @@ int stack_tracer_enabled; static int last_stack_tracer_enabled; static inline void -check_stack(unsigned long *stack) +check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; @@ -70,6 +84,17 @@ check_stack(unsigned long *stack) save_stack_trace(&max_stack_trace); + /* + * When fentry is used, the traced function does not get + * its stack frame set up, and we lose the parent. + * Add that one in manally. We set up save_stack_trace() + * to not touch the first element in this case. + */ + if (add_func) { + stack_dump_trace[0] = ip; + max_stack_trace.nr_entries++; + } + /* * Now find where in the stack these are. */ @@ -124,7 +149,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - check_stack(&stack); + check_stack(parent_ip, &stack); out: per_cpu(trace_active, cpu)--; -- cgit v1.2.3 From 4df297129f622bdc18935c856f42b9ddd18f9f28 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 13 Mar 2013 23:34:22 -0400 Subject: tracing: Remove most or all of stack tracer stack size from stack_max_size Currently, the depth reported in the stack tracer stack_trace file does not match the stack_max_size file. This is because the stack_max_size includes the overhead of stack tracer itself while the depth does not. The first time a max is triggered, a calculation is not performed that figures out the overhead of the stack tracer and subtracts it from the stack_max_size variable. The overhead is stored and is subtracted from the reported stack size for comparing for a new max. Now the stack_max_size corresponds to the reported depth: # cat stack_max_size 4640 # cat stack_trace Depth Size Location (48 entries) ----- ---- -------- 0) 4640 32 _raw_spin_lock+0x18/0x24 1) 4608 112 ____cache_alloc+0xb7/0x22d 2) 4496 80 kmem_cache_alloc+0x63/0x12f 3) 4416 16 mempool_alloc_slab+0x15/0x17 [...] While testing against and older gcc on x86 that uses mcount instead of fentry, I found that pasing in ip + MCOUNT_INSN_SIZE let the stack trace show one more function deep which was missing before. Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_stack.c | 75 +++++++++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index ea28e4b0ed58..aab277b67fa9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -20,27 +20,24 @@ #define STACK_TRACE_ENTRIES 500 -/* - * If fentry is used, then the function being traced will - * jump to fentry directly before it sets up its stack frame. - * We need to ignore that one and record the parent. Since - * the stack frame for the traced function wasn't set up yet, - * the stack_trace wont see the parent. That needs to be added - * manually to stack_dump_trace[] as the first element. - */ #ifdef CC_USING_FENTRY -# define add_func 1 +# define fentry 1 #else -# define add_func 0 +# define fentry 0 #endif static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. + */ static struct stack_trace max_stack_trace = { - .max_entries = STACK_TRACE_ENTRIES - add_func, - .entries = &stack_dump_trace[add_func], + .max_entries = STACK_TRACE_ENTRIES - 1, + .entries = &stack_dump_trace[1], }; static unsigned long max_stack_size; @@ -58,10 +55,14 @@ check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; + static int tracer_frame; + int frame_size = ACCESS_ONCE(tracer_frame); int i; this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; + /* Remove the frame of the tracer */ + this_size -= frame_size; if (this_size <= max_stack_size) return; @@ -73,6 +74,10 @@ check_stack(unsigned long ip, unsigned long *stack) local_irq_save(flags); arch_spin_lock(&max_stack_lock); + /* In case another CPU set the tracer_frame on us */ + if (unlikely(!frame_size)) + this_size -= tracer_frame; + /* a race could have already updated it */ if (this_size <= max_stack_size) goto out; @@ -85,15 +90,12 @@ check_stack(unsigned long ip, unsigned long *stack) save_stack_trace(&max_stack_trace); /* - * When fentry is used, the traced function does not get - * its stack frame set up, and we lose the parent. - * Add that one in manally. We set up save_stack_trace() - * to not touch the first element in this case. + * Add the passed in ip from the function tracer. + * Searching for this on the stack will skip over + * most of the overhead from the stack tracer itself. */ - if (add_func) { - stack_dump_trace[0] = ip; - max_stack_trace.nr_entries++; - } + stack_dump_trace[0] = ip; + max_stack_trace.nr_entries++; /* * Now find where in the stack these are. @@ -123,6 +125,18 @@ check_stack(unsigned long ip, unsigned long *stack) found = 1; /* Start the search from here */ start = p + 1; + /* + * We do not want to show the overhead + * of the stack tracer stack in the + * max stack. If we haven't figured + * out what that is, then figure it out + * now. + */ + if (unlikely(!tracer_frame) && i == 1) { + tracer_frame = (p - stack) * + sizeof(unsigned long); + max_stack_size -= tracer_frame; + } } } @@ -149,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - check_stack(parent_ip, &stack); + /* + * When fentry is used, the traced function does not get + * its stack frame set up, and we lose the parent. + * The ip is pretty useless because the function tracer + * was called before that function set up its stack frame. + * In this case, we use the parent ip. + * + * By adding the return address of either the parent ip + * or the current ip we can disregard most of the stack usage + * caused by the stack tracer itself. + * + * The function tracer always reports the address of where the + * mcount call was, but the stack will hold the return address. + */ + if (fentry) + ip = parent_ip; + else + ip += MCOUNT_INSN_SIZE; + + check_stack(ip, &stack); out: per_cpu(trace_active, cpu)--; -- cgit v1.2.3 From 328df4759c03e2c3e7429cc6cb0e180c38f32063 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 14 Mar 2013 12:10:40 -0400 Subject: tracing: Add function-trace option to disable function tracing of latency tracers Currently, the only way to stop the latency tracers from doing function tracing is to fully disable the function tracer from the proc file system: echo 0 > /proc/sys/kernel/ftrace_enabled This is a big hammer approach as it disables function tracing for all users. This includes kprobes, perf, stack tracer, etc. Instead, create a function-trace option that the latency tracers can check to determine if it should enable function tracing or not. This option can be set or cleared even while the tracer is active and the tracers will disable or enable function tracing depending on how the option was set. Instead of using the proc file, disable latency function tracing with echo 0 > /debug/tracing/options/function-trace Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Clark Williams <williams@redhat.com> Cc: John Kacur <jkacur@redhat.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 3 +- kernel/trace/trace.h | 1 + kernel/trace/trace_irqsoff.c | 67 ++++++++++++++++++++++++++++++++------- kernel/trace/trace_sched_wakeup.c | 63 ++++++++++++++++++++++++++++++------ 4 files changed, 111 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8aa53213201f..f90ca16afcf2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -328,7 +328,7 @@ static inline void trace_access_lock_init(void) unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; /** * tracing_on - enable tracing buffers @@ -635,6 +635,7 @@ static const char *trace_options[] = { "disable_on_free", "irq-info", "markers", + "function-trace", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 0e430b401ab6..5cc52361bc9f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -867,6 +867,7 @@ enum trace_iterator_flags { TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, TRACE_ITER_MARKERS = 0x1000000, + TRACE_ITER_FUNCTION = 0x2000000, }; /* diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 5aa40ab72b57..b19d065a28cb 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -33,6 +33,7 @@ enum { static int trace_type __read_mostly; static int save_flags; +static bool function_enabled; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(int graph, int set) { - int ret = 0; + int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); + else + ret = register_ftrace_function(&trace_ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_irqsoff_function(int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(&trace_ops); + + function_enabled = false; +} + +static void irqsoff_function_set(int set) +{ + if (set) + register_irqsoff_function(is_graph(), 1); + else + unregister_irqsoff_function(is_graph()); +} + +static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +{ + if (mask & TRACE_ITER_FUNCTION) + irqsoff_function_set(set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_irqsoff_function(graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -550,10 +596,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_irqsoff_function(graph); } static void __irqsoff_tracer_init(struct trace_array *tr) @@ -615,7 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif @@ -649,7 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif @@ -685,7 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index c16f8cd63c3c..fee77e15d815 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -37,6 +37,7 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_flags; +static bool function_enabled; #define TRACE_DISPLAY_GRAPH 1 @@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(int graph, int set) { int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); + else + ret = register_ftrace_function(&trace_ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_wakeup_function(int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(&trace_ops); + + function_enabled = false; +} + +static void wakeup_function_set(int set) +{ + if (set) + register_wakeup_function(is_graph(), 1); + else + unregister_wakeup_function(is_graph()); +} + +static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +{ + if (mask & TRACE_ITER_FUNCTION) + wakeup_function_set(set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(int graph) +{ + int ret; + + ret = register_wakeup_function(graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -156,10 +202,7 @@ static void stop_func_tracer(int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_wakeup_function(graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -600,7 +643,7 @@ static struct tracer wakeup_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif @@ -622,7 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, - .flag_changed = trace_keep_overwrite, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif -- cgit v1.2.3 From 8aacf017b065a805d27467843490c976835eb4a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 14 Mar 2013 13:13:45 -0400 Subject: tracing: Add "uptime" trace clock that uses jiffies Add a simple trace clock called "uptime" for those that are interested in the uptime of the trace. It uses jiffies as that's the safest method, as other uptime clocks grab seq locks, which could cause a deadlock if taken from an event or function tracer. Requested-by: Mauro Carvalho Chehab <mchehab@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/trace_clock.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_clock.c | 10 ++++++++++ 3 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h index d563f37e1a1d..1d7ca2739272 100644 --- a/include/linux/trace_clock.h +++ b/include/linux/trace_clock.h @@ -16,6 +16,7 @@ extern u64 notrace trace_clock_local(void); extern u64 notrace trace_clock(void); +extern u64 notrace trace_clock_jiffies(void); extern u64 notrace trace_clock_global(void); extern u64 notrace trace_clock_counter(void); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f90ca16afcf2..8eabfbb8003e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -647,6 +647,7 @@ static struct { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 1 }, ARCH_TRACE_CLOCKS }; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae6..26dc348332b7 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -57,6 +57,16 @@ u64 notrace trace_clock(void) return local_clock(); } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + */ +u64 notrace trace_clock_jiffies(void) +{ + u64 jiffy = jiffies - INITIAL_JIFFIES; + + /* Return nsecs */ + return (u64)jiffies_to_usecs(jiffy) * 1000ULL; +} /* * trace_clock_global(): special globally coherent trace clock -- cgit v1.2.3 From 76f119179b8ce3188a8c61d2486d37810a416655 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Thu, 14 Mar 2013 17:53:25 -0400 Subject: tracing: Add "perf" trace_clock The function trace_clock() calls "local_clock()" which is exactly the same clock that perf uses. I'm not sure why perf doesn't call trace_clock(), as trace_clock() doesn't have any users. But now it does. As trace_clock() calls local_clock() like perf does, I added the trace_clock "perf" option that uses trace_clock(). Now the ftrace buffers can use the same clock as perf uses. This will be useful when perf starts reading the ftrace buffers, and will be able to interleave them with the same clock data. Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8eabfbb8003e..7f0e7fa6d62c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -648,6 +648,7 @@ static struct { { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, { trace_clock_jiffies, "uptime", 1 }, + { trace_clock, "perf", 1 }, ARCH_TRACE_CLOCKS }; -- cgit v1.2.3 From 6c43e554a2a5c1f2caf1733d46719bc58de3e37b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 15 Mar 2013 11:32:53 -0400 Subject: ring-buffer: Add ring buffer startup selftest When testing my large changes to the ftrace system, there was a bug that looked like the ring buffer was dropping events. I wrote up a quick integrity checker of the ring buffer to see if it was. Although the bug ended up being something stupid I did in ftrace, and had nothing to do with the ring buffer, I figured if I spent the time to write up this test, I might as well include it in the kernel. I cleaned it up a bit, as the original version was rather ugly. Not saying this version is pretty, but it's a beauty queen compared to what I original wrote. To enable the start up test, set CONFIG_RING_BUFFER_STARTUP_TEST. Note, it runs for 10 seconds, so it will slow your boot time by at least 10 more seconds. What it does is documented in both the comments and the Kconfig help. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/Kconfig | 23 ++++ kernel/trace/ring_buffer.c | 319 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 342 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f78eab251897..0b5ecf5517fa 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -565,6 +565,29 @@ config RING_BUFFER_BENCHMARK If unsure, say N. +config RING_BUFFER_STARTUP_TEST + bool "Ring buffer startup self test" + depends on RING_BUFFER + help + Run a simple self test on the ring buffer on boot up. Late in the + kernel boot sequence, the test will start that kicks off + a thread per cpu. Each thread will write various size events + into the ring buffer. Another thread is created to send IPIs + to each of the threads, where the IPI handler will also write + to the ring buffer, to test/stress the nesting ability. + If any anomalies are discovered, a warning will be displayed + and all ring buffers will be disabled. + + The test runs for 10 seconds. This will slow your boot time + by at least 10 more seconds. + + At the end of the test, statics and more checks are done. + It will output the stats of each per cpu buffer. What + was written, the sizes, what was read, what was lost, and + other similar details. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d1c85c5f5f51..e5472f7bc347 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -12,10 +12,12 @@ #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/hardirq.h> +#include <linux/kthread.h> /* for self test */ #include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> @@ -4634,3 +4636,320 @@ static int rb_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } #endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. + */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { + struct ring_buffer *buffer; + unsigned long events; + unsigned long bytes_written; + unsigned long bytes_alloc; + unsigned long bytes_dropped; + unsigned long events_nested; + unsigned long bytes_written_nested; + unsigned long bytes_alloc_nested; + unsigned long bytes_dropped_nested; + int min_size_nested; + int max_size_nested; + int max_size; + int min_size; + int cpu; + int cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE 1048576 + +static char rb_string[] __initdata = + "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" + "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" + "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { + int size; + char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ + struct ring_buffer_event *event; + struct rb_item *item; + bool started; + int event_len; + int size; + int len; + int cnt; + + /* Have nested writes different that what is written */ + cnt = data->cnt + (nested ? 27 : 0); + + /* Multiply cnt by ~e, to make some unique increment */ + size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + + len = size + sizeof(struct rb_item); + + started = rb_test_started; + /* read rb_test_started before checking buffer enabled */ + smp_rmb(); + + event = ring_buffer_lock_reserve(data->buffer, len); + if (!event) { + /* Ignore dropped events before test starts. */ + if (started) { + if (nested) + data->bytes_dropped += len; + else + data->bytes_dropped_nested += len; + } + return len; + } + + event_len = ring_buffer_event_length(event); + + if (RB_WARN_ON(data->buffer, event_len < len)) + goto out; + + item = ring_buffer_event_data(event); + item->size = size; + memcpy(item->str, rb_string, size); + + if (nested) { + data->bytes_alloc_nested += event_len; + data->bytes_written_nested += len; + data->events_nested++; + if (!data->min_size_nested || len < data->min_size_nested) + data->min_size_nested = len; + if (len > data->max_size_nested) + data->max_size_nested = len; + } else { + data->bytes_alloc += event_len; + data->bytes_written += len; + data->events++; + if (!data->min_size || len < data->min_size) + data->max_size = len; + if (len > data->max_size) + data->max_size = len; + } + + out: + ring_buffer_unlock_commit(data->buffer, event); + + return 0; +} + +static __init int rb_test(void *arg) +{ + struct rb_test_data *data = arg; + + while (!kthread_should_stop()) { + rb_write_something(data, false); + data->cnt++; + + set_current_state(TASK_INTERRUPTIBLE); + /* Now sleep between a min of 100-300us and a max of 1ms */ + usleep_range(((data->cnt % 3) + 1) * 100, 1000); + } + + return 0; +} + +static __init void rb_ipi(void *ignore) +{ + struct rb_test_data *data; + int cpu = smp_processor_id(); + + data = &rb_data[cpu]; + rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ + while (!kthread_should_stop()) { + + /* Send an IPI to all cpus to write data! */ + smp_call_function(rb_ipi, NULL, 1); + /* No sleep, but for non preempt, let others run */ + schedule(); + } + + return 0; +} + +static __init int test_ringbuffer(void) +{ + struct task_struct *rb_hammer; + struct ring_buffer *buffer; + int cpu; + int ret = 0; + + pr_info("Running ring buffer tests...\n"); + + buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); + if (WARN_ON(!buffer)) + return 0; + + /* Disable buffer so that threads can't write to it yet */ + ring_buffer_record_off(buffer); + + for_each_online_cpu(cpu) { + rb_data[cpu].buffer = buffer; + rb_data[cpu].cpu = cpu; + rb_data[cpu].cnt = cpu; + rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], + "rbtester/%d", cpu); + if (WARN_ON(!rb_threads[cpu])) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + kthread_bind(rb_threads[cpu], cpu); + wake_up_process(rb_threads[cpu]); + } + + /* Now create the rb hammer! */ + rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); + if (WARN_ON(!rb_hammer)) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + ring_buffer_record_on(buffer); + /* + * Show buffer is enabled before setting rb_test_started. + * Yes there's a small race window where events could be + * dropped and the thread wont catch it. But when a ring + * buffer gets enabled, there will always be some kind of + * delay before other CPUs see it. Thus, we don't care about + * those dropped events. We care about events dropped after + * the threads see that the buffer is active. + */ + smp_wmb(); + rb_test_started = true; + + set_current_state(TASK_INTERRUPTIBLE); + /* Just run for 10 seconds */; + schedule_timeout(10 * HZ); + + kthread_stop(rb_hammer); + + out_free: + for_each_online_cpu(cpu) { + if (!rb_threads[cpu]) + break; + kthread_stop(rb_threads[cpu]); + } + if (ret) { + ring_buffer_free(buffer); + return ret; + } + + /* Report! */ + pr_info("finished\n"); + for_each_online_cpu(cpu) { + struct ring_buffer_event *event; + struct rb_test_data *data = &rb_data[cpu]; + struct rb_item *item; + unsigned long total_events; + unsigned long total_dropped; + unsigned long total_written; + unsigned long total_alloc; + unsigned long total_read = 0; + unsigned long total_size = 0; + unsigned long total_len = 0; + unsigned long total_lost = 0; + unsigned long lost; + int big_event_size; + int small_event_size; + + ret = -1; + + total_events = data->events + data->events_nested; + total_written = data->bytes_written + data->bytes_written_nested; + total_alloc = data->bytes_alloc + data->bytes_alloc_nested; + total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + + big_event_size = data->max_size + data->max_size_nested; + small_event_size = data->min_size + data->min_size_nested; + + pr_info("CPU %d:\n", cpu); + pr_info(" events: %ld\n", total_events); + pr_info(" dropped bytes: %ld\n", total_dropped); + pr_info(" alloced bytes: %ld\n", total_alloc); + pr_info(" written bytes: %ld\n", total_written); + pr_info(" biggest event: %d\n", big_event_size); + pr_info(" smallest event: %d\n", small_event_size); + + if (RB_WARN_ON(buffer, total_dropped)) + break; + + ret = 0; + + while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { + total_lost += lost; + item = ring_buffer_event_data(event); + total_len += ring_buffer_event_length(event); + total_size += item->size + sizeof(struct rb_item); + if (memcmp(&item->str[0], rb_string, item->size) != 0) { + pr_info("FAILED!\n"); + pr_info("buffer had: %.*s\n", item->size, item->str); + pr_info("expected: %.*s\n", item->size, rb_string); + RB_WARN_ON(buffer, 1); + ret = -1; + break; + } + total_read++; + } + if (ret) + break; + + ret = -1; + + pr_info(" read events: %ld\n", total_read); + pr_info(" lost events: %ld\n", total_lost); + pr_info(" total events: %ld\n", total_lost + total_read); + pr_info(" recorded len bytes: %ld\n", total_len); + pr_info(" recorded size bytes: %ld\n", total_size); + if (total_lost) + pr_info(" With dropped events, record len and size may not match\n" + " alloced and written from above\n"); + if (!total_lost) { + if (RB_WARN_ON(buffer, total_len != total_alloc || + total_size != total_written)) + break; + } + if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) + break; + + ret = 0; + } + if (!ret) + pr_info("Ring buffer PASSED!\n"); + + ring_buffer_free(buffer); + return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ -- cgit v1.2.3 From 687c878afb526a0c3117dbc408ca76ad80d689f7 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com> Date: Mon, 11 Mar 2013 15:13:29 +0800 Subject: tracing: Use pr_warn_once instead of open coded implementation Use pr_warn_once, instead of making an open coded implementation. Link: http://lkml.kernel.org/r/513D8419.20400@huawei.com Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7f0e7fa6d62c..bba1ba958ee8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5205,8 +5205,6 @@ static inline int register_snapshot_cmd(void) { return 0; } struct dentry *tracing_init_dentry_tr(struct trace_array *tr) { - static int once; - if (tr->dir) return tr->dir; @@ -5216,11 +5214,8 @@ struct dentry *tracing_init_dentry_tr(struct trace_array *tr) if (tr->flags & TRACE_ARRAY_FL_GLOBAL) tr->dir = debugfs_create_dir("tracing", NULL); - if (!tr->dir && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'tracing'\n"); - return NULL; - } + if (!tr->dir) + pr_warn_once("Could not create debugfs directory 'tracing'\n"); return tr->dir; } -- cgit v1.2.3 From bd6df18716fa45bc4aa9587aca033de909e5382b Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com> Date: Mon, 11 Mar 2013 15:13:37 +0800 Subject: tracing: Use TRACE_MAX_PRINT instead of constant TRACE_MAX_PRINT macro is defined, but is not used. Link: http://lkml.kernel.org/r/513D8421.4070404@huawei.com Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bba1ba958ee8..848625674752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5978,8 +5978,8 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= 1000) - s->len = 1000; + if (s->len >= TRACE_MAX_PRINT) + s->len = TRACE_MAX_PRINT; /* should be zero ended, but we are paranoid. */ s->buffer[s->len] = 0; -- cgit v1.2.3 From b3a8c6fd7bb61c910bd4f80ae1d75056e8f98c19 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com> Date: Mon, 11 Mar 2013 15:13:42 +0800 Subject: tracing: Move find_event_field() into trace_events.c By moving find_event_field() and trace_find_field() into trace_events.c, the ftrace_common_fields list and trace_get_fields() can become local to the trace_events.c file. find_event_field() is renamed to trace_find_event_field() to conform to the tracing global function names. Link: http://lkml.kernel.org/r/513D8426.9070109@huawei.com Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> [ rostedt: Modified trace_find_field() to trace_find_event_field() ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.h | 6 ++---- kernel/trace/trace_events.c | 31 +++++++++++++++++++++++++++++-- kernel/trace/trace_events_filter.c | 29 +---------------------------- 3 files changed, 32 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 5cc52361bc9f..9e014582e763 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -995,8 +995,6 @@ struct filter_pred { unsigned short right; }; -extern struct list_head ftrace_common_fields; - extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, @@ -1009,8 +1007,8 @@ extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c636523b1a59..ba523d7beea2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -34,7 +34,7 @@ char event_storage[EVENT_STORAGE_SIZE]; EXPORT_SYMBOL_GPL(event_storage); LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static LIST_HEAD(ftrace_common_fields); #define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) @@ -54,7 +54,7 @@ static struct kmem_cache *file_cachep; #define while_for_each_event_file() \ } -struct list_head * +static struct list_head * trace_get_fields(struct ftrace_event_call *event_call) { if (!event_call->class->get_fields) @@ -62,6 +62,33 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, head, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 2a22a177ab44..a6361178de5a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system, mutex_unlock(&event_mutex); } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ - struct ftrace_event_field *field; - - list_for_each_entry(field, head, link) { - if (!strcmp(field->name, name)) - return field; - } - - return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ - struct ftrace_event_field *field; - struct list_head *head; - - field = __find_event_field(&ftrace_common_fields, name); - if (field) - return field; - - head = trace_get_fields(call); - return __find_event_field(head, name); -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); @@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, return NULL; } - field = find_event_field(call, operand1); + field = trace_find_event_field(call, operand1); if (!field) { parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; -- cgit v1.2.3 From ad7067cebf3253412a7c0a169a9dd056b11e69ac Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com> Date: Mon, 11 Mar 2013 15:13:46 +0800 Subject: tracing: Convert trace_destroy_fields() to static trace_destroy_fields() is not used outside of the file. It can be a static function. Link: http://lkml.kernel.org/r/513D842A.2000907@huawei.com Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ba523d7beea2..a71cdc3c5df9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -158,7 +158,7 @@ static int trace_define_common_fields(void) return ret; } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; -- cgit v1.2.3 From 36a78e9e8792bfb052643eaf9374f837e634982c Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com> Date: Mon, 11 Mar 2013 15:13:51 +0800 Subject: tracing: Fix comment about prefix in arch_syscall_match_sym_name() ppc64 has its own syscall prefix like ".SyS" or ".sys". Make the comment in arch_syscall_match_sym_name() more understandable. Link: http://lkml.kernel.org/r/513D842F.40205@huawei.com Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_syscalls.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 68f3f344be65..8f2ac73c7a5f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -37,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name /* * Only compare after the "sys" prefix. Archs that use * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted + * with ".SyS" or ".sys" instead of "sys", leading to an unwanted * mismatch. */ return !strcmp(sym + 3, name + 3); -- cgit v1.2.3 From 52f6ad6dc3f4c6de598fe7cc9b629888d624aa52 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com> Date: Mon, 11 Mar 2013 15:14:03 +0800 Subject: tracing: Rename trace_event_mutex to trace_event_sem trace_event_mutex is an rw semaphore now, not a mutex, change the name. Link: http://lkml.kernel.org/r/513D843B.40109@huawei.com Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> [ Forward ported to my new code ] Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace_events.c | 22 +++++++++++----------- kernel/trace/trace_output.c | 16 ++++++++-------- kernel/trace/trace_output.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a71cdc3c5df9..53582e982e51 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1584,7 +1584,7 @@ int trace_add_event_call(struct ftrace_event_call *call) } /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Must be called under locking both of event_mutex and trace_event_sem. */ static void __trace_remove_event_call(struct ftrace_event_call *call) { @@ -1597,9 +1597,9 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) void trace_remove_event_call(struct ftrace_event_call *call) { mutex_lock(&event_mutex); - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __trace_remove_event_call(call); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); mutex_unlock(&event_mutex); } @@ -1707,7 +1707,7 @@ static void trace_module_remove_events(struct module *mod) struct ftrace_event_call *call, *p; bool clear_trace = false; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) @@ -1725,7 +1725,7 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } - up_write(&trace_event_mutex); + up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded @@ -2262,9 +2262,9 @@ int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) if (ret) goto out_unlock; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __trace_add_event_dirs(tr); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); out_unlock: mutex_unlock(&event_mutex); @@ -2287,9 +2287,9 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr) if (ret) goto out_unlock; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __trace_early_add_event_dirs(tr); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); out_unlock: mutex_unlock(&event_mutex); @@ -2304,10 +2304,10 @@ int event_trace_del_tracer(struct trace_array *tr) mutex_lock(&event_mutex); - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __trace_remove_event_dirs(tr); debugfs_remove_recursive(tr->event_dir); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); tr->event_dir = NULL; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 19f48e7edc39..f475b2a7ac88 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -826,12 +826,12 @@ static int trace_search_list(struct list_head **list) void trace_event_read_lock(void) { - down_read(&trace_event_mutex); + down_read(&trace_event_sem); } void trace_event_read_unlock(void) { - up_read(&trace_event_mutex); + up_read(&trace_event_sem); } /** @@ -854,7 +854,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); if (WARN_ON(!event)) goto out; @@ -909,14 +909,14 @@ int register_ftrace_event(struct trace_event *event) ret = event->type; out: - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write. */ int __unregister_ftrace_event(struct trace_event *event) { @@ -931,9 +931,9 @@ int __unregister_ftrace_event(struct trace_event *event) */ int unregister_ftrace_event(struct trace_event *event) { - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __unregister_ftrace_event(event); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return 0; } diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index af77870de278..127a9d8c8357 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -33,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem; #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) -- cgit v1.2.3 From 7fe70b579c9e3daba71635e31b6189394e7b79d3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 15 Mar 2013 13:10:35 -0400 Subject: tracing: Fix ftrace_dump() ftrace_dump() had a lot of issues. What ftrace_dump() does, is when ftrace_dump_on_oops is set (via a kernel parameter or sysctl), it will dump out the ftrace buffers to the console when either a oops, panic, or a sysrq-z occurs. This was written a long time ago when ftrace was fragile to recursion. But it wasn't written well even for that. There's a possible deadlock that can occur if a ftrace_dump() is happening and an NMI triggers another dump. This is because it grabs a lock before checking if the dump ran. It also totally disables ftrace, and tracing for no good reasons. As the ring_buffer now checks if it is read via a oops or NMI, where there's a chance that the buffer gets corrupted, it will disable itself. No need to have ftrace_dump() do the same. ftrace_dump() is now cleaned up where it uses an atomic counter to make sure only one dump happens at a time. A simple atomic_inc_return() is enough that is needed for both other CPUs and NMIs. No need for a spinlock, as if one CPU is running the dump, no other CPU needs to do it too. The tracing_on variable is turned off and not turned on. The original code did this, but it wasn't pretty. By just disabling this variable we get the result of not seeing traces that happen between crashes. For sysrq-z, it doesn't get turned on, but the user can always write a '1' to the tracing_on file. If they are using sysrq-z, then they should know about tracing_on. The new code is much easier to read and less error prone. No more deadlock possibility when an NMI triggers here. Reported-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> Cc: stable@vger.kernel.org Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Frederic Weisbecker <fweisbec@gmail.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 62 ++++++++++++++++++------------------------- kernel/trace/trace_selftest.c | 9 ++++--- 2 files changed, 31 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 848625674752..3dc7999594e1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5997,36 +5997,32 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->trace_buffer = &global_trace.trace_buffer; } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - static arch_spinlock_t ftrace_dump_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static atomic_t dump_running; unsigned int old_userobj; - static int dump_ran; unsigned long flags; int cnt = 0, cpu; - /* only one dump */ - local_irq_save(flags); - arch_spin_lock(&ftrace_dump_lock); - if (dump_ran) - goto out; - - dump_ran = 1; + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + /* + * Always turn off tracing when we dump. + * We don't need to show trace output of what happens + * between multiple crashes. + * + * If the user does a sysrq-z, then they can re-enable + * tracing with echo 1 > tracing_on. + */ tracing_off(); - /* Did function tracer already get disabled? */ - if (ftrace_is_dead()) { - printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - printk("# MAY BE MISSING FUNCTION EVENTS\n"); - } - - if (disable_tracing) - ftrace_kill(); + local_irq_save(flags); /* Simulate the iterator */ trace_init_global_iter(&iter); @@ -6056,6 +6052,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "Dumping ftrace buffer:\n"); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + /* * We need to stop all tracing on all CPUS to read the * the next buffer. This is a bit expensive, but is @@ -6095,26 +6097,14 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "---------------------------------\n"); out_enable: - /* Re-enable tracing if requested */ - if (!disable_tracing) { - trace_flags |= old_userobj; + trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); - } - tracing_on(); + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - - out: - arch_spin_unlock(&ftrace_dump_lock); + atomic_dec(&dump_running); local_irq_restore(flags); } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ - __ftrace_dump(true, oops_dump_mode); -} EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 8672c40cb153..55e2cf66967b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) - __ftrace_dump(false, DUMP_ALL); + if (ftrace_dump_on_oops) { + ftrace_dump(DUMP_ALL); + /* ftrace_dump() disables tracing */ + tracing_on(); + } return 0; } -- cgit v1.2.3 From 778141e3cf0bf29f91cd3cb5c314ea477b9402a7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Mon, 18 Mar 2013 11:41:46 +0900 Subject: perf: Reset hwc->last_period on sw clock events When cpu/task clock events are initialized, their sampling frequencies are converted to have a fixed value. However it missed to update the hwc->last_period which was set to 1 for initial sampling frequency calibration. Because this hwc->last_period value is used as a period in perf_swevent_ hrtime(), every recorded sample will have an incorrected period of 1. $ perf record -e task-clock noploop 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.158 MB perf.data (~6919 samples) ] $ perf report -n --show-total-period --stdio # Samples: 4K of event 'task-clock' # Event count (approx.): 4000 # # Overhead Samples Period Command Shared Object Symbol # ........ ............ ............ ....... ............. .................. # 99.95% 3998 3998 noploop noploop [.] main 0.03% 1 1 noploop libc-2.15.so [.] init_cacheinfo 0.03% 1 1 noploop ld-2.15.so [.] open_verify Note that it doesn't affect the non-sampling event so that the perf stat still gets correct value with or without this patch. $ perf stat -e task-clock noploop 1 Performance counter stats for 'noploop 1': 1000.272525 task-clock # 1.000 CPUs utilized 1.000560605 seconds time elapsed Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1363574507-18808-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c30..fa79c377d65d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5647,6 +5647,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } -- cgit v1.2.3 From d610d98b5de6860feb21539726e9af7c9094151c Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Fri, 15 Mar 2013 16:27:13 +0900 Subject: perf: Generate EXIT event only once per task context perf_event_task_event() iterates pmu list and generate events for each eligible pmu context. But if task_event has task_ctx like in EXIT it'll generate events even though the pmu doesn't have an eligible one. Fix it by moving the code to proper places. Before this patch: $ perf record -n true [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.006 MB perf.data (~248 samples) ] $ perf report -D | tail Aggregated stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 cycles stats: TOTAL events: 73 MMAP events: 67 COMM events: 2 EXIT events: 4 After this patch: $ perf report -D | tail Aggregated stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 cycles stats: TOTAL events: 70 MMAP events: 67 COMM events: 2 EXIT events: 1 Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/1363332433-7637-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/core.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index fa79c377d65d..59412d037eed 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4434,12 +4434,15 @@ static void perf_event_task_event(struct perf_task_event *task_event) if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); } - if (ctx) - perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } + if (task_event->task_ctx) + perf_event_task_ctx(task_event->task_ctx, task_event); + rcu_read_unlock(); } -- cgit v1.2.3 From 86e213e1d901fbeaf6e57d13c5edd925fadddcbe Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Mon, 18 Mar 2013 18:56:34 +0900 Subject: perf/cgroup: Add __percpu annotation to perf_cgroup->info It's a per-cpu data structure but missed the __percpu annotation. Signed-off-by: Namhyung Kim <namhyung@kernel.org> Cc: Tejun Heo <tj@kernel.org> Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Cc: Namhyung Kim <namhyung.kim@lge.com> Link: http://lkml.kernel.org/r/1363600594-11453-1-git-send-email-namhyung@kernel.org Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5976a2a6b4ce..efb75b3a69ad 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -245,7 +245,7 @@ struct perf_cgroup_info { struct perf_cgroup { struct cgroup_subsys_state css; - struct perf_cgroup_info *info; + struct perf_cgroup_info __percpu *info; }; /* -- cgit v1.2.3 From 14a40ffccd6163bbcd1d6f32b28a88ffe6149fc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 19 Mar 2013 13:45:20 -0700 Subject: sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY PF_THREAD_BOUND was originally used to mark kernel threads which were bound to a specific CPU using kthread_bind() and a task with the flag set allows cpus_allowed modifications only to itself. Workqueue is currently abusing it to prevent userland from meddling with cpus_allowed of workqueue workers. What we need is a flag to prevent userland from messing with cpus_allowed of certain kernel tasks. In kernel, anyone can (incorrectly) squash the flag, and, for worker-type usages, restricting cpus_allowed modification to the task itself doesn't provide meaningful extra proection as other tasks can inject work items to the task anyway. This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY. sched_setaffinity() checks the flag and return -EINVAL if set. set_cpus_allowed_ptr() is no longer affected by the flag. This will allow simplifying workqueue worker CPU affinity management. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Ingo Molnar <mingo@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> --- include/linux/sched.h | 2 +- kernel/cgroup.c | 4 ++-- kernel/cpuset.c | 16 ++++++++-------- kernel/kthread.c | 2 +- kernel/sched/core.c | 9 ++++----- kernel/workqueue.c | 10 +++------- 6 files changed, 19 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..e5c64f7b8c1d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1793,7 +1793,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ -#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..3852d926322c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2224,11 +2224,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..f22e94792707 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1388,16 +1388,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) cgroup_taskset_for_each(task, cgrp, tset) { /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ ret = -EINVAL; - if (task->flags & PF_THREAD_BOUND) + if (task->flags & PF_NO_SETAFFINITY) goto out_unlock; ret = security_task_setscheduler(task); if (ret) diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..a2fbbb782bad 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -260,7 +260,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) { /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + p->flags |= PF_NO_SETAFFINITY; } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..23606ee961b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4126,6 +4126,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4773,11 +4777,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 969be0b72071..39a591f65b08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1757,12 +1757,8 @@ static struct worker *create_worker(struct worker_pool *pool) set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* - * %PF_THREAD_BOUND is used to prevent userland from meddling with - * cpumask of workqueue workers. This is an abuse. We need - * %PF_NO_SETAFFINITY. - */ - worker->task->flags |= PF_THREAD_BOUND; + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; /* * The caller is responsible for ensuring %POOL_DISASSOCIATED @@ -3876,7 +3872,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_THREAD_BOUND; + rescuer->task->flags |= PF_NO_SETAFFINITY; wake_up_process(rescuer->task); } -- cgit v1.2.3 From 822d8405d13931062d653e0c2cc0199ed801b072 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: workqueue: convert worker_pool->worker_ida to idr and implement for_each_pool_worker() Make worker_ida an idr - worker_idr and use it to implement for_each_pool_worker() which will be used to simplify worker rebinding on CPU_ONLINE. pool->worker_idr is protected by both pool->manager_mutex and pool->lock so that it can be iterated while holding either lock. * create_worker() allocates ID without installing worker pointer and installs the pointer later using idr_replace(). This is because worker ID is needed when creating the actual task to name it and the new worker shouldn't be visible to iterations before fully initialized. * In destroy_worker(), ID removal is moved before kthread_stop(). This is again to guarantee that only fully working workers are visible to for_each_pool_worker(). Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 63 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 39a591f65b08..384ff34c9aff 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -119,6 +119,9 @@ enum { * * F: wq->flush_mutex protected. * + * MG: pool->manager_mutex and pool->lock protected. Writes require both + * locks. Reads can happen under either lock. + * * WQ: wq_mutex protected. * * WR: wq_mutex protected for writes. Sched-RCU protected for reads. @@ -156,7 +159,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ struct mutex manager_mutex; /* manager exclusion */ - struct ida worker_ida; /* L: for worker IDs */ + struct idr worker_idr; /* MG: worker IDs and iteration */ struct workqueue_attrs *attrs; /* I: worker attributes */ struct hlist_node hash_node; /* WQ: unbound_pool_hash node */ @@ -299,6 +302,15 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&pwq_lock), \ "sched RCU or pwq_lock should be held") +#ifdef CONFIG_LOCKDEP +#define assert_manager_or_pool_lock(pool) \ + WARN_ONCE(!lockdep_is_held(&(pool)->manager_mutex) && \ + !lockdep_is_held(&(pool)->lock), \ + "pool->manager_mutex or ->lock should be held") +#else +#define assert_manager_or_pool_lock(pool) do { } while (0) +#endif + #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -324,6 +336,22 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, if (({ assert_rcu_or_wq_mutex(); false; })) { } \ else +/** + * for_each_pool_worker - iterate through all workers of a worker_pool + * @worker: iteration cursor + * @wi: integer used for iteration + * @pool: worker_pool to iterate workers of + * + * This must be called with either @pool->manager_mutex or ->lock held. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pool_worker(worker, wi, pool) \ + idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ + if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ + else + /** * for_each_pwq - iterate through all pool_workqueues of the specified workqueue * @pwq: iteration cursor @@ -1723,14 +1751,19 @@ static struct worker *create_worker(struct worker_pool *pool) lockdep_assert_held(&pool->manager_mutex); + /* + * ID is needed to determine kthread name. Allocate ID first + * without installing the pointer. + */ + idr_preload(GFP_KERNEL); spin_lock_irq(&pool->lock); - while (ida_get_new(&pool->worker_ida, &id)) { - spin_unlock_irq(&pool->lock); - if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) - goto fail; - spin_lock_irq(&pool->lock); - } + + id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); + spin_unlock_irq(&pool->lock); + idr_preload_end(); + if (id < 0) + goto fail; worker = alloc_worker(); if (!worker) @@ -1768,11 +1801,17 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; + /* successful, commit the pointer to idr */ + spin_lock_irq(&pool->lock); + idr_replace(&pool->worker_idr, worker, worker->id); + spin_unlock_irq(&pool->lock); + return worker; + fail: if (id >= 0) { spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); + idr_remove(&pool->worker_idr, id); spin_unlock_irq(&pool->lock); } kfree(worker); @@ -1832,7 +1871,6 @@ static int create_and_start_worker(struct worker_pool *pool) static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - int id = worker->id; lockdep_assert_held(&pool->manager_mutex); lockdep_assert_held(&pool->lock); @@ -1850,13 +1888,14 @@ static void destroy_worker(struct worker *worker) list_del_init(&worker->entry); worker->flags |= WORKER_DIE; + idr_remove(&pool->worker_idr, worker->id); + spin_unlock_irq(&pool->lock); kthread_stop(worker->task); kfree(worker); spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __pool) @@ -3482,7 +3521,7 @@ static int init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->manager_mutex); - ida_init(&pool->worker_ida); + idr_init(&pool->worker_idr); INIT_HLIST_NODE(&pool->hash_node); pool->refcnt = 1; @@ -3498,7 +3537,7 @@ static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - ida_destroy(&pool->worker_ida); + idr_destroy(&pool->worker_idr); free_workqueue_attrs(pool->attrs); kfree(pool); } -- cgit v1.2.3 From bd7c089eb25b26d2e03fd34f97e5517a4463f871 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: workqueue: relocate rebind_workers() rebind_workers() will be reimplemented in a way which makes it mostly decoupled from the rest of worker management. Move rebind_workers() so that it's located with other CPU hotplug related functions. This patch is pure function relocation. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 142 ++++++++++++++++++++++++++--------------------------- 1 file changed, 71 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 384ff34c9aff..3e297c574be8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1643,77 +1643,6 @@ static void busy_worker_rebind_fn(struct work_struct *work) spin_unlock_irq(&worker->pool->lock); } -/** - * rebind_workers - rebind all workers of a pool to the associated CPU - * @pool: pool of interest - * - * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding - * is different for idle and busy ones. - * - * Idle ones will be removed from the idle_list and woken up. They will - * add themselves back after completing rebind. This ensures that the - * idle_list doesn't contain any unbound workers when re-bound busy workers - * try to perform local wake-ups for concurrency management. - * - * Busy workers can rebind after they finish their current work items. - * Queueing the rebind work item at the head of the scheduled list is - * enough. Note that nr_running will be properly bumped as busy workers - * rebind. - * - * On return, all non-manager workers are scheduled for rebind - see - * manage_workers() for the manager special case. Any idle worker - * including the manager will not appear on @idle_list until rebind is - * complete, making local wake-ups safe. - */ -static void rebind_workers(struct worker_pool *pool) -{ - struct worker *worker, *n; - int i; - - lockdep_assert_held(&pool->manager_mutex); - lockdep_assert_held(&pool->lock); - - /* dequeue and kick idle ones */ - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list until rebind - * is complete to avoid receiving premature local wake-ups. - */ - list_del_init(&worker->entry); - - /* - * worker_thread() will see the above dequeuing and call - * idle_worker_rebind(). - */ - wake_up_process(worker->task); - } - - /* rebind busy workers */ - for_each_busy_worker(worker, i, pool) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; - - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - - /* - * wq doesn't really matter but let's keep @worker->pool - * and @pwq->pool consistent for sanity. - */ - if (worker->pool->attrs->nice < 0) - wq = system_highpri_wq; - else - wq = system_wq; - - insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -4196,6 +4125,77 @@ static void wq_unbind_fn(struct work_struct *work) atomic_set(&pool->nr_running, 0); } +/** + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest + * + * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding + * is different for idle and busy ones. + * + * Idle ones will be removed from the idle_list and woken up. They will + * add themselves back after completing rebind. This ensures that the + * idle_list doesn't contain any unbound workers when re-bound busy workers + * try to perform local wake-ups for concurrency management. + * + * Busy workers can rebind after they finish their current work items. + * Queueing the rebind work item at the head of the scheduled list is + * enough. Note that nr_running will be properly bumped as busy workers + * rebind. + * + * On return, all non-manager workers are scheduled for rebind - see + * manage_workers() for the manager special case. Any idle worker + * including the manager will not appear on @idle_list until rebind is + * complete, making local wake-ups safe. + */ +static void rebind_workers(struct worker_pool *pool) +{ + struct worker *worker, *n; + int i; + + lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->lock); + + /* dequeue and kick idle ones */ + list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { + /* + * idle workers should be off @pool->idle_list until rebind + * is complete to avoid receiving premature local wake-ups. + */ + list_del_init(&worker->entry); + + /* + * worker_thread() will see the above dequeuing and call + * idle_worker_rebind(). + */ + wake_up_process(worker->task); + } + + /* rebind busy workers */ + for_each_busy_worker(worker, i, pool) { + struct work_struct *rebind_work = &worker->rebind_work; + struct workqueue_struct *wq; + + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + + /* + * wq doesn't really matter but let's keep @worker->pool + * and @pwq->pool consistent for sanity. + */ + if (worker->pool->attrs->nice < 0) + wq = system_highpri_wq; + else + wq = system_wq; + + insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } +} + /* * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. -- cgit v1.2.3 From a9ab775bcadf122d91e1a201eb66ae2eec90365a Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: workqueue: directly restore CPU affinity of workers from CPU_ONLINE Rebinding workers of a per-cpu pool after a CPU comes online involves a lot of back-and-forth mostly because only the task itself could adjust CPU affinity if PF_THREAD_BOUND was set. As CPU_ONLINE itself couldn't adjust affinity, it had to somehow coerce the workers themselves to perform set_cpus_allowed_ptr(). Due to the various states a worker can be in, this led to three different paths a worker may be rebound. worker->rebind_work is queued to busy workers. Idle ones are signaled by unlinking worker->entry and call idle_worker_rebind(). The manager isn't covered by either and implements its own mechanism. PF_THREAD_BOUND has been relaced with PF_NO_SETAFFINITY and CPU_ONLINE itself now can manipulate CPU affinity of workers. This patch replaces the existing rebind mechanism with direct one where CPU_ONLINE iterates over all workers using for_each_pool_worker(), restores CPU affinity, and clears WORKER_UNBOUND. There are a couple subtleties. All bound idle workers should have their runqueues set to that of the bound CPU; however, if the target task isn't running, set_cpus_allowed_ptr() just updates the cpus_allowed mask deferring the actual migration to when the task wakes up. This is worked around by waking up idle workers after restoring CPU affinity before any workers can become bound. Another subtlety is stems from matching @pool->nr_running with the number of running unbound workers. While DISASSOCIATED, all workers are unbound and nr_running is zero. As workers become bound again, nr_running needs to be adjusted accordingly; however, there is no good way to tell whether a given worker is running without poking into scheduler internals. Instead of clearing UNBOUND directly, rebind_workers() replaces UNBOUND with another new NOT_RUNNING flag - REBOUND, which will later be cleared by the workers themselves while preparing for the next round of work item execution. The only change needed for the workers is clearing REBOUND along with PREP. * This patch leaves for_each_busy_worker() without any user. Removed. * idle_worker_rebind(), busy_worker_rebind_fn(), worker->rebind_work and rebind logic in manager_workers() removed. * worker_thread() now looks at WORKER_DIE instead of testing whether @worker->entry is empty to determine whether it needs to do something special as dying is the only special thing now. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 192 +++++++++++++++----------------------------- kernel/workqueue_internal.h | 3 - 2 files changed, 64 insertions(+), 131 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3e297c574be8..9508b5ed7336 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -75,9 +75,10 @@ enum { WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + WORKER_REBOUND = 1 << 8, /* worker was rebound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | - WORKER_CPU_INTENSIVE, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | + WORKER_UNBOUND | WORKER_REBOUND, NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ @@ -316,9 +317,6 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ (pool)++) -#define for_each_busy_worker(worker, i, pool) \ - hash_for_each(pool->busy_hash, i, worker, hentry) - /** * for_each_pool - iterate through all worker_pools in the system * @pool: iteration cursor @@ -1612,37 +1610,6 @@ __acquires(&pool->lock) } } -/* - * Rebind an idle @worker to its CPU. worker_thread() will test - * list_empty(@worker->entry) before leaving idle and call this function. - */ -static void idle_worker_rebind(struct worker *worker) -{ - /* CPU may go down again inbetween, clear UNBOUND only on success */ - if (worker_maybe_bind_and_lock(worker->pool)) - worker_clr_flags(worker, WORKER_UNBOUND); - - /* rebind complete, become available again */ - list_add(&worker->entry, &worker->pool->idle_list); - spin_unlock_irq(&worker->pool->lock); -} - -/* - * Function for @worker->rebind.work used to rebind unbound busy workers to - * the associated cpu which is coming back online. This is scheduled by - * cpu up but can race with other cpu hotplug operations and may be - * executed twice without intervening cpu down. - */ -static void busy_worker_rebind_fn(struct work_struct *work) -{ - struct worker *worker = container_of(work, struct worker, rebind_work); - - if (worker_maybe_bind_and_lock(worker->pool)) - worker_clr_flags(worker, WORKER_UNBOUND); - - spin_unlock_irq(&worker->pool->lock); -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -1651,7 +1618,6 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -2053,22 +2019,6 @@ static bool manage_workers(struct worker *worker) if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); mutex_lock(&pool->manager_mutex); - /* - * CPU hotplug could have happened while we were waiting - * for assoc_mutex. Hotplug itself can't handle us - * because manager isn't either on idle or busy list, and - * @pool's state and ours could have deviated. - * - * As hotplug is now excluded via manager_mutex, we can - * simply try to bind. It will succeed or fail depending - * on @pool's current state. Try it and adjust - * %WORKER_UNBOUND accordingly. - */ - if (worker_maybe_bind_and_lock(pool)) - worker->flags &= ~WORKER_UNBOUND; - else - worker->flags |= WORKER_UNBOUND; - ret = true; } @@ -2252,19 +2202,12 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&pool->lock); - /* we are off idle list if destruction or rebind is requested */ - if (unlikely(list_empty(&worker->entry))) { + /* am I supposed to die? */ + if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); - - /* if DIE is set, destruction is requested */ - if (worker->flags & WORKER_DIE) { - worker->task->flags &= ~PF_WQ_WORKER; - return 0; - } - - /* otherwise, rebind */ - idle_worker_rebind(worker); - goto woke_up; + WARN_ON_ONCE(!list_empty(&worker->entry)); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; } worker_leave_idle(worker); @@ -2285,11 +2228,13 @@ recheck: WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* - * When control reaches this point, we're guaranteed to have - * at least one idle worker or that someone else has already - * assumed the manager role. + * Finish PREP stage. We're guaranteed to have at least one idle + * worker or that someone else has already assumed the manager + * role. This is where @worker starts participating in concurrency + * management if applicable and concurrency management is restored + * after being rebound. See rebind_workers() for details. */ - worker_clr_flags(worker, WORKER_PREP); + worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = @@ -4076,7 +4021,7 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int i; + int wi; for_each_cpu_worker_pool(pool, cpu) { WARN_ON_ONCE(cpu != smp_processor_id()); @@ -4091,10 +4036,7 @@ static void wq_unbind_fn(struct work_struct *work) * before the last CPU down must be on the cpu. After * this, they may become diasporas. */ - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags |= WORKER_UNBOUND; - - for_each_busy_worker(worker, i, pool) + for_each_pool_worker(worker, wi, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; @@ -4129,71 +4071,64 @@ static void wq_unbind_fn(struct work_struct *work) * rebind_workers - rebind all workers of a pool to the associated CPU * @pool: pool of interest * - * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding - * is different for idle and busy ones. - * - * Idle ones will be removed from the idle_list and woken up. They will - * add themselves back after completing rebind. This ensures that the - * idle_list doesn't contain any unbound workers when re-bound busy workers - * try to perform local wake-ups for concurrency management. - * - * Busy workers can rebind after they finish their current work items. - * Queueing the rebind work item at the head of the scheduled list is - * enough. Note that nr_running will be properly bumped as busy workers - * rebind. - * - * On return, all non-manager workers are scheduled for rebind - see - * manage_workers() for the manager special case. Any idle worker - * including the manager will not appear on @idle_list until rebind is - * complete, making local wake-ups safe. + * @pool->cpu is coming online. Rebind all workers to the CPU. */ static void rebind_workers(struct worker_pool *pool) { - struct worker *worker, *n; - int i; + struct worker *worker; + int wi; lockdep_assert_held(&pool->manager_mutex); - lockdep_assert_held(&pool->lock); - - /* dequeue and kick idle ones */ - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list until rebind - * is complete to avoid receiving premature local wake-ups. - */ - list_del_init(&worker->entry); - /* - * worker_thread() will see the above dequeuing and call - * idle_worker_rebind(). - */ - wake_up_process(worker->task); - } - - /* rebind busy workers */ - for_each_busy_worker(worker, i, pool) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; + /* + * Restore CPU affinity of all workers. As all idle workers should + * be on the run-queue of the associated CPU before any local + * wake-ups for concurrency management happen, restore CPU affinty + * of all workers first and then clear UNBOUND. As we're called + * from CPU_ONLINE, the following shouldn't fail. + */ + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; + spin_lock_irq(&pool->lock); - debug_work_activate(rebind_work); + for_each_pool_worker(worker, wi, pool) { + unsigned int worker_flags = worker->flags; /* - * wq doesn't really matter but let's keep @worker->pool - * and @pwq->pool consistent for sanity. + * A bound idle worker should actually be on the runqueue + * of the associated CPU for local wake-ups targeting it to + * work. Kick all idle workers so that they migrate to the + * associated CPU. Doing this in the same loop as + * replacing UNBOUND with REBOUND is safe as no worker will + * be bound before @pool->lock is released. */ - if (worker->pool->attrs->nice < 0) - wq = system_highpri_wq; - else - wq = system_wq; + if (worker_flags & WORKER_IDLE) + wake_up_process(worker->task); - insert_work(per_cpu_ptr(wq->cpu_pwqs, pool->cpu), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); + /* + * We want to clear UNBOUND but can't directly call + * worker_clr_flags() or adjust nr_running. Atomically + * replace UNBOUND with another NOT_RUNNING flag REBOUND. + * @worker will clear REBOUND using worker_clr_flags() when + * it initiates the next execution cycle thus restoring + * concurrency management. Note that when or whether + * @worker clears REBOUND doesn't affect correctness. + * + * ACCESS_ONCE() is necessary because @worker->flags may be + * tested without holding any lock in + * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * fail incorrectly leading to premature concurrency + * management operations. + */ + WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); + worker_flags |= WORKER_REBOUND; + worker_flags &= ~WORKER_UNBOUND; + ACCESS_ONCE(worker->flags) = worker_flags; } + + spin_unlock_irq(&pool->lock); } /* @@ -4221,12 +4156,13 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_ONLINE: for_each_cpu_worker_pool(pool, cpu) { mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); + spin_lock_irq(&pool->lock); pool->flags &= ~POOL_DISASSOCIATED; + spin_unlock_irq(&pool->lock); + rebind_workers(pool); - spin_unlock_irq(&pool->lock); mutex_unlock(&pool->manager_mutex); } break; diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index f116f071d919..84ab6e1dc6fb 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -38,9 +38,6 @@ struct worker { unsigned int flags; /* X: flags */ int id; /* I: worker id */ - /* for rebinding worker to CPU */ - struct work_struct rebind_work; /* L: for busy worker */ - /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; -- cgit v1.2.3 From 7dbc725e4749d822eb6dc962526049af1586f041 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Tue, 19 Mar 2013 13:45:21 -0700 Subject: workqueue: restore CPU affinity of unbound workers on CPU_ONLINE With the recent addition of the custom attributes support, unbound pools may have allowed cpumask which isn't full. As long as some of CPUs in the cpumask are online, its workers will maintain cpus_allowed as set on worker creation; however, once no online CPU is left in cpus_allowed, the scheduler will reset cpus_allowed of any workers which get scheduled so that they can execute. To remain compliant to the user-specified configuration, CPU affinity needs to be restored when a CPU becomes online for an unbound pool which doesn't currently have any online CPUs before. This patch implement restore_unbound_workers_cpumask(), which is called from CPU_ONLINE for all unbound pools, checks whether the coming up CPU is the first allowed online one, and, if so, invokes set_cpus_allowed_ptr() with the configured cpumask on all workers. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9508b5ed7336..e38d035bf671 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4131,6 +4131,39 @@ static void rebind_workers(struct worker_pool *pool) spin_unlock_irq(&pool->lock); } +/** + * restore_unbound_workers_cpumask - restore cpumask of unbound workers + * @pool: unbound pool of interest + * @cpu: the CPU which is coming up + * + * An unbound pool may end up with a cpumask which doesn't have any online + * CPUs. When a worker of such pool get scheduled, the scheduler resets + * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any + * online CPU before, cpus_allowed of all its workers should be restored. + */ +static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) +{ + static cpumask_t cpumask; + struct worker *worker; + int wi; + + lockdep_assert_held(&pool->manager_mutex); + + /* is @cpu allowed for @pool? */ + if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) + return; + + /* is @cpu the only online CPU? */ + cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); + if (cpumask_weight(&cpumask) != 1) + return; + + /* as we're called from CPU_ONLINE, the following shouldn't fail */ + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); +} + /* * Workqueues should be brought up before normal priority CPU notifiers. * This will be registered high priority CPU notifier. @@ -4141,6 +4174,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, { int cpu = (unsigned long)hcpu; struct worker_pool *pool; + int pi; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: @@ -4154,17 +4188,25 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - for_each_cpu_worker_pool(pool, cpu) { + mutex_lock(&wq_mutex); + + for_each_pool(pool, pi) { mutex_lock(&pool->manager_mutex); - spin_lock_irq(&pool->lock); - pool->flags &= ~POOL_DISASSOCIATED; - spin_unlock_irq(&pool->lock); + if (pool->cpu == cpu) { + spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; + spin_unlock_irq(&pool->lock); - rebind_workers(pool); + rebind_workers(pool); + } else if (pool->cpu < 0) { + restore_unbound_workers_cpumask(pool, cpu); + } mutex_unlock(&pool->manager_mutex); } + + mutex_unlock(&wq_mutex); break; } return NOTIFY_OK; -- cgit v1.2.3 From 3ac1707a13a3da9cfc8f242a15b2fae6df2c5f88 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Tue, 12 Mar 2013 15:36:00 -0700 Subject: cgroup: fix an off-by-one bug which may trigger BUG_ON() The 3rd parameter of flex_array_prealloc() is the number of elements, not the index of the last element. The effect of the bug is, when opening cgroup.procs, a flex array will be allocated and all elements of the array is allocated with GFP_KERNEL flag, but the last one is GFP_ATOMIC, and if we fail to allocate memory for it, it'll trigger a BUG_ON(). Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> Cc: stable@vger.kernel.org --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c7fe303b5109..54689fc008f6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2076,7 +2076,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (!group) return -ENOMEM; /* pre-allocate to guarantee space while iterating in rcu read-side. */ - retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); if (retval) goto out_free_group_list; -- cgit v1.2.3 From 081aa458c38ba576bdd4265fc807fa95b48b9e79 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Wed, 13 Mar 2013 09:17:09 +0800 Subject: cgroup: consolidate cgroup_attach_task() and cgroup_attach_proc() These two functions share most of the code. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/cgroup.h | 3 +- kernel/cgroup.c | 109 +++++++++---------------------------------------- kernel/cpuset.c | 2 +- 3 files changed, 23 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7e818a3ef60a..01c48c6806d6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -693,7 +693,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, struct cgroup_iter *it); void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); -int cgroup_attach_task(struct cgroup *, struct task_struct *); +int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 54689fc008f6..04fa2abf94b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,7 +59,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> -#include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/flex_array.h> /* used in cgroup_attach_task */ #include <linux/kthread.h> #include <linux/atomic.h> @@ -1943,82 +1943,6 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); } -/** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. - */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval = 0; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroup *oldcgrp; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_taskset tset = { }; - struct css_set *newcg; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - return 0; - - tset.single.task = tsk; - tset.single.cgrp = oldcgrp; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); - if (retval) { - /* - * Remember on which subsystem the can_attach() - * failed, so that we only call cancel_attach() - * against the subsystems whose can_attach() - * succeeded. (See below) - */ - failed_ss = ss; - goto out; - } - } - } - - newcg = find_css_set(tsk->cgroups, cgrp); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(cgrp, &tset); - } - -out: - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - /* - * This subsystem was the one that failed the - * can_attach() check earlier, so we don't need - * to call cancel_attach() against it or any - * remaining subsystems. - */ - break; - if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); - } - } - return retval; -} - /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task @@ -2033,7 +1957,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) for_each_active_root(root) { struct cgroup *from_cg = task_cgroup_from_root(from, root); - retval = cgroup_attach_task(from_cg, tsk); + retval = cgroup_attach_task(from_cg, tsk, false); if (retval) break; } @@ -2044,21 +1968,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached + * @tsk: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. + * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - /* guaranteed to be initialized later, but the compiler needs this */ struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ - struct task_struct *tsk; + struct task_struct *leader = tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -2070,7 +1995,10 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * group - group_rwsem prevents new threads from appearing, and if * threads exit, this will just be an over-estimate. */ - group_size = get_nr_threads(leader); + if (threadgroup) + group_size = get_nr_threads(tsk); + else + group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) @@ -2080,7 +2008,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (retval) goto out_free_group_list; - tsk = leader; i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2109,6 +2036,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; + + if (!threadgroup) + break; } while_each_thread(leader, tsk); rcu_read_unlock(); /* remember the number of threads in the array for later. */ @@ -2262,9 +2192,10 @@ retry_find_task: put_task_struct(tsk); goto retry_find_task; } - ret = cgroup_attach_proc(cgrp, tsk); - } else - ret = cgroup_attach_task(cgrp, tsk); + } + + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + threadgroup_unlock(tsk); put_task_struct(tsk); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index efbfca7a33e4..98d458aad789 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2008,7 +2008,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, struct cgroup *new_cgroup = scan->data; cgroup_lock(); - cgroup_attach_task(new_cgroup, tsk); + cgroup_attach_task(new_cgroup, tsk, false); cgroup_unlock(); } -- cgit v1.2.3 From 12ee4fc67c00895b3d740297f7ca447239c1983b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Wed, 20 Mar 2013 03:28:01 +0800 Subject: workqueue: add missing POOL_FREEZING get_unbound_pool() forgot to set POOL_FREEZING if workqueue_freezing is set and a new pool could go out of sync with the global freezing state. Fix it by adding POOL_FREEZING if workqueue_freezing. wq_mutex is already held so no further locking is necessary. This also removes the unused static variable warning when !CONFIG_FREEZER. tj: Updated commit message. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e38d035bf671..40f4017285a0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3503,6 +3503,9 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; + if (workqueue_freezing) + pool->flags |= POOL_FREEZING; + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); -- cgit v1.2.3 From 6a092dfd51e5af9b321d683d4b4eddc79e2606ed Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Wed, 20 Mar 2013 03:28:03 +0800 Subject: workqueue: simplify current_is_workqueue_rescuer() We can test worker->recue_wq instead of reaching into current_pwq->wq->rescuer and then comparing it to self. tj: Commit message. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 40f4017285a0..d2ac6cbfe8ab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3936,7 +3936,7 @@ bool current_is_workqueue_rescuer(void) { struct worker *worker = current_wq_worker(); - return worker && worker == worker->current_pwq->wq->rescuer; + return worker && worker->rescue_wq; } /** -- cgit v1.2.3 From 951a078a5285ad31bc22e190616ad54b78fac992 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Wed, 20 Mar 2013 10:52:30 -0700 Subject: workqueue: kick a worker in pwq_adjust_max_active() If pwq_adjust_max_active() changes max_active from 0 to saved_max_active, it needs to wakeup worker. This is already done by thaw_workqueues(). If pwq_adjust_max_active() increases max_active for an unbound wq, while not strictly necessary for correctness, it's still desirable to wake up a worker so that the requested concurrency level is reached sooner. Move wake_up_worker() call from thaw_workqueues() to pwq_adjust_max_active() so that it can handle both of the above two cases. This also makes thaw_workqueues() simpler. tj: Updated comments and description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d2ac6cbfe8ab..79d1d347e690 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3598,6 +3598,12 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) while (!list_empty(&pwq->delayed_works) && pwq->nr_active < pwq->max_active) pwq_activate_first_delayed(pwq); + + /* + * Need to kick a worker after thawed or an unbound wq's + * max_active is bumped. It's a slow path. Do it always. + */ + wake_up_worker(pwq->pool); } else { pwq->max_active = 0; } @@ -4401,13 +4407,6 @@ void thaw_workqueues(void) } spin_unlock_irq(&pwq_lock); - /* kick workers */ - for_each_pool(pool, pi) { - spin_lock_irq(&pool->lock); - wake_up_worker(pool); - spin_unlock_irq(&pool->lock); - } - workqueue_freezing = false; out_unlock: mutex_unlock(&wq_mutex); -- cgit v1.2.3 From 881094532e2a27406a5f06f839087bd152a8a494 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Wed, 20 Mar 2013 03:28:10 +0800 Subject: workqueue: use rcu_read_lock_sched() instead for accessing pwq in RCU rcu_read_lock_sched() is better than preempt_disable() if the code is protected by RCU_SCHED. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 79d1d347e690..b6c5a524d7c4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3962,7 +3962,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) struct pool_workqueue *pwq; bool ret; - preempt_disable(); + rcu_read_lock_sched(); if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); @@ -3970,7 +3970,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) pwq = first_pwq(wq); ret = !list_empty(&pwq->delayed_works); - preempt_enable(); + rcu_read_unlock_sched(); return ret; } @@ -4354,16 +4354,16 @@ bool freeze_workqueues_busy(void) * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - preempt_disable(); + rcu_read_lock_sched(); for_each_pwq(pwq, wq) { WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; - preempt_enable(); + rcu_read_unlock_sched(); goto out_unlock; } } - preempt_enable(); + rcu_read_unlock_sched(); } out_unlock: mutex_unlock(&wq_mutex); -- cgit v1.2.3 From 519e3c1163ce2b2d510b76b0f5b374198f9378f3 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Wed, 20 Mar 2013 03:28:21 +0800 Subject: workqueue: avoid false negative in assert_manager_or_pool_lock() If lockdep complains something for other subsystem, lockdep_is_held() can be false negative, so we need to also test debug_locks before triggering WARN. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6c5a524d7c4..47f258799bf2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -305,7 +305,8 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #ifdef CONFIG_LOCKDEP #define assert_manager_or_pool_lock(pool) \ - WARN_ONCE(!lockdep_is_held(&(pool)->manager_mutex) && \ + WARN_ONCE(debug_locks && \ + !lockdep_is_held(&(pool)->manager_mutex) && \ !lockdep_is_held(&(pool)->lock), \ "pool->manager_mutex or ->lock should be held") #else -- cgit v1.2.3 From 22f45649ce08642ad7df238d5c25fa5c86bfdd31 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 15 Mar 2013 17:23:20 -0400 Subject: tracing: Update debugfs README file Update the README file in debugfs/tracing to something more useful. What's currently in the file is very old and what it shows doesn't have much use. Heck, it tells you how to mount debugfs! But to read this file you would have already needed to mount it. Replace the file with current up-to-date information. It's rather limited, but what do you expect from a pseudo README file. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 92 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3dc7999594e1..829b2bee24e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3300,20 +3300,84 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" - "# mount -t debugfs nodev /sys/kernel/debug\n\n" - "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "nop\n" - "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "wakeup\n" - "# cat /sys/kernel/debug/tracing/trace_options\n" - "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" - "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" + "# echo 0 > tracing_on : quick way to disable tracing\n" + "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" + " Important files:\n" + " trace\t\t\t- The static contents of the buffer\n" + "\t\t\t To clear the buffer write into this file: echo > trace\n" + " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" + " current_tracer\t- function and latency tracers\n" + " available_tracers\t- list of configured tracers for current_tracer\n" + " buffer_size_kb\t- view and modify size of per cpu buffer\n" + " buffer_total_size_kb - view total size of all cpu buffers\n\n" + " trace_clock\t\t-change the clock used to order events\n" + " local: Per cpu clock but may not be synced across CPUs\n" + " global: Synced across CPUs but slows tracing down.\n" + " counter: Not a clock, but just an increment\n" + " uptime: Jiffy counter from time of boot\n" + " perf: Same clock that perf events use\n" +#ifdef CONFIG_X86_64 + " x86-tsc: TSC cycle counter\n" +#endif + "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + " tracing_cpumask\t- Limit which CPUs to trace\n" + " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" + "\t\t\t Remove sub-buffer with rmdir\n" + " trace_options\t\t- Set format or modify how tracing happens\n" + "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" +#ifdef CONFIG_DYNAMIC_FTRACE + "\n available_filter_functions - list of functions that can be filtered on\n" + " set_ftrace_filter\t- echo function name in here to only trace these functions\n" + " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + " modules: Can select a group via module\n" + " Format: :mod:<module-name>\n" + " example: echo :mod:ext3 > set_ftrace_filter\n" + " triggers: a command to perform when function is hit\n" + " Format: <function>:<trigger>[:count]\n" + " trigger: traceon, traceoff\n" + " enable_event:<system>:<event>\n" + " disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + " stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + " snapshot\n" +#endif + " example: echo do_fault:traceoff > set_ftrace_filter\n" + " echo do_trap:traceoff:3 > set_ftrace_filter\n" + " The first one will disable tracing every time do_fault is hit\n" + " The second will disable tracing at most 3 times when do_trap is hit\n" + " The first time do trap is hit and it disables tracing, the counter\n" + " will decrement to 2. If tracing is already disabled, the counter\n" + " will not decrement. It only decrements when the trigger did work\n" + " To remove trigger without count:\n" + " echo '!<function>:<trigger> > set_ftrace_filter\n" + " To remove trigger with a count:\n" + " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + " set_ftrace_notrace\t- echo function name in here to never trace.\n" + " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + " modules: Can select a group via module command :mod:\n" + " Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER + " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" + "\t\t\t Read the contents for more information\n" +#endif +#ifdef CONFIG_STACKTRACE + " stack_trace\t\t- Shows the max stack trace when active\n" + " stack_max_size\t- Shows current max stack size that was traced\n" + "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" +#endif +#endif /* CONFIG_STACKTRACE */ ; static ssize_t -- cgit v1.2.3 From 383efcd00053ec40023010ce5034bd702e7ab373 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 18 Mar 2013 12:22:34 -0700 Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s try_to_wake_up_local() should only be invoked to wake up another task in the same runqueue and BUG_ON()s are used to enforce the rule. Missing try_to_wake_up_local() can stall workqueue execution but such stalls are likely to be finite either by another work item being queued or the one blocked getting unblocked. There's no reason to trigger BUG while holding rq lock crashing the whole system. Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Steven Rostedt <rostedt@goodmis.org> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7b03cd2d4cd..306943f531a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { -- cgit v1.2.3 From dd9c086d9f507d02d5ba4d7c5eef4bb9518088b8 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Mon, 18 Mar 2013 14:33:28 +0100 Subject: perf: Fix ring_buffer perf_output_space() boundary calculation This patch fixes a flaw in perf_output_space(). In case the size of the space needed is bigger than the actual buffer size, there may be situations where the function would return true (i.e., there is space) when it should not. head > offset due to rounding of the masking logic. The problem can be tested by activating BTS on Intel processors. A BTS record can be as big as 16 pages. The following command fails: $ perf record -m 4 -c 1 -e branches:u my_test_program You will get a buffer corruption with this. Perf report won't be able to parse the perf.data. The fix is to first check that the requested space is smaller than the buffer size. If so, then the masking logic will work fine. If not, then there is no chance the record can be saved and it will be gracefully handled by upper code layers. [ In v2, we also make the logic for the writable more explicit by renaming it to rb->overwrite because it tells whether or not the buffer can overwrite its tail (suggested by PeterZ). ] Signed-off-by: Stephane Eranian <eranian@google.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: peterz@infradead.org Cc: jolsa@redhat.com Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/20130318133327.GA3056@quad Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/internal.h | 2 +- kernel/events/ring_buffer.c | 22 ++++++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8b..eb675c4d59df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -16,7 +16,7 @@ struct ring_buffer { int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ + int overwrite; /* can overwrite itself */ atomic_t poll; /* POLL_ for wakeups */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff3973..97fddb09762b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -18,12 +18,24 @@ static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, unsigned long offset, unsigned long head) { - unsigned long mask; + unsigned long sz = perf_data_size(rb); + unsigned long mask = sz - 1; - if (!rb->writable) + /* + * check if user-writable + * overwrite : over-write its own tail + * !overwrite: buffer possibly drops events. + */ + if (rb->overwrite) return true; - mask = perf_data_size(rb) - 1; + /* + * verify that payload is not bigger than buffer + * otherwise masking logic may fail to detect + * the "not enough space" condition + */ + if ((head - offset) > sz) + return false; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) - rb->writable = 1; + rb->overwrite = 0; + else + rb->overwrite = 1; atomic_set(&rb->refcount, 1); -- cgit v1.2.3 From dc72c32e1fd872a9a4fdfe645283c9dcd68e556d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker <fweisbec@gmail.com> Date: Fri, 22 Mar 2013 15:04:39 -0700 Subject: printk: Provide a wake_up_klogd() off-case wake_up_klogd() is useless when CONFIG_PRINTK=n because neither printk() nor printk_sched() are in use and there are actually no waiter on log_wait waitqueue. It should be a stub in this case for users like bust_spinlocks(). Otherwise this results in this warning when CONFIG_PRINTK=n and CONFIG_IRQ_WORK=n: kernel/built-in.o In function `wake_up_klogd': (.text.wake_up_klogd+0xb4): undefined reference to `irq_work_queue' To fix this, provide an off-case for wake_up_klogd() when CONFIG_PRINTK=n. There is much more from console_unlock() and other console related code in printk.c that should be moved under CONFIG_PRINTK. But for now, focus on a minimal fix as we passed the merged window already. [akpm@linux-foundation.org: include printk.h in bust_spinlocks.c] Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Reported-by: James Hogan <james.hogan@imgtec.com> Cc: James Hogan <james.hogan@imgtec.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Ingo Molnar <mingo@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/kernel.h | 1 - include/linux/printk.h | 6 ++++ kernel/printk.c | 80 ++++++++++++++++++++++++-------------------------- lib/bust_spinlocks.c | 3 +- 4 files changed, 46 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80d36874689b..79fdd80a42d4 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -390,7 +390,6 @@ extern struct pid *session_of_pgrp(struct pid *pgrp); unsigned long int_sqrt(unsigned long); extern void bust_spinlocks(int yes); -extern void wake_up_klogd(void); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; extern int panic_on_oops; diff --git a/include/linux/printk.h b/include/linux/printk.h index 1249a54d17e0..822171fcb1c8 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -134,6 +134,8 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int kptr_restrict; +extern void wake_up_klogd(void); + void log_buf_kexec_setup(void); void __init setup_log_buf(int early); #else @@ -162,6 +164,10 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, return false; } +static inline void wake_up_klogd(void) +{ +} + static inline void log_buf_kexec_setup(void) { } diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335a..abbdd9e2ac82 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); - int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ @@ -224,6 +222,7 @@ struct log { static DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -1957,45 +1956,6 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ - int pending = __this_cpu_xchg(printk_pending, 0); - - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ - preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - } - preempt_enable(); -} - static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2458,6 +2418,44 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} int printk_sched(const char *fmt, ...) { diff --git a/lib/bust_spinlocks.c b/lib/bust_spinlocks.c index 9681d54b95d1..f8e0e5367398 100644 --- a/lib/bust_spinlocks.c +++ b/lib/bust_spinlocks.c @@ -8,6 +8,7 @@ */ #include <linux/kernel.h> +#include <linux/printk.h> #include <linux/spinlock.h> #include <linux/tty.h> #include <linux/wait.h> @@ -28,5 +29,3 @@ void __attribute__((weak)) bust_spinlocks(int yes) wake_up_klogd(); } } - - -- cgit v1.2.3 From 2ca067efd82939dfd87827d29d36a265823a4c2f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Fri, 22 Mar 2013 15:04:41 -0700 Subject: poweroff: change orderly_poweroff() to use schedule_work() David said: Commit 6c0c0d4d1080 ("poweroff: fix bug in orderly_poweroff()") apparently fixes one bug in orderly_poweroff(), but introduces another. The comments on orderly_poweroff() claim it can be called from any context - and indeed we call it from interrupt context in arch/powerpc/platforms/pseries/ras.c for example. But since that commit this is no longer safe, since call_usermodehelper_fns() is not safe in interrupt context without the UMH_NO_WAIT option. orderly_poweroff() can be used from any context but UMH_WAIT_EXEC is sleepable. Move the "force" logic into __orderly_poweroff() and change orderly_poweroff() to use the global poweroff_work which simply calls __orderly_poweroff(). While at it, remove the unneeded "int argc" and change argv_split() to use GFP_KERNEL. We use the global "bool poweroff_force" to pass the argument, this can obviously affect the previous request if it is pending/running. So we only allow the "false => true" transition assuming that the pending "true" should succeed anyway. If schedule_work() fails after that we know that work->func() was not called yet, it must see the new value. This means that orderly_poweroff() becomes async even if we do not run the command and always succeeds, schedule_work() can only fail if the work is already pending. We can export __orderly_poweroff() and change the non-atomic callers which want the old semantics. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Reported-by: Benjamin Herrenschmidt <benh@kernel.crashing.org> Reported-by: David Gibson <david@gibson.dropbear.id.au> Cc: Lucas De Marchi <lucas.demarchi@profusion.mobi> Cc: Feng Hong <hongfeng@marvell.com> Cc: Kees Cook <keescook@chromium.org> Cc: Serge Hallyn <serge.hallyn@canonical.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/sys.c | 57 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 32 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba9..39c9c4a2949f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2185,9 +2185,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static int __orderly_poweroff(void) +static int __orderly_poweroff(bool force) { - int argc; char **argv; static char *envp[] = { "HOME=/", @@ -2196,20 +2195,40 @@ static int __orderly_poweroff(void) }; int ret; - argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - if (argv == NULL) { + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - return -ENOMEM; + __func__, poweroff_cmd); + ret = -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, - NULL, NULL, NULL); - argv_free(argv); + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } return ret; } +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + /** * orderly_poweroff - Trigger an orderly system poweroff * @force: force poweroff if command execution fails @@ -2219,21 +2238,9 @@ static int __orderly_poweroff(void) */ int orderly_poweroff(bool force) { - int ret = __orderly_poweroff(); - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); -- cgit v1.2.3 From 68e13a67ddfb55af386b903ab9ca56358930f79c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Mon, 25 Mar 2013 16:57:17 -0700 Subject: workqueue: rename wq_mutex to wq_pool_mutex wq->flush_mutex will be renamed to wq->mutex and cover all fields specific to each workqueue and eventually replace pwq_lock, which will make locking simpler and easier to understand. Rename wq_mutex to wq_pool_mutex to avoid confusion with wq->mutex. After the scheduled changes, wq_pool_mutex won't be protecting anything specific to each workqueue instance anyway. This patch is pure rename. tj: s/wqs_mutex/wq_pool_mutex/. Rewrote description. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 109 +++++++++++++++++++++++++++-------------------------- 1 file changed, 55 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 47f258799bf2..064157eac4c8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -123,9 +123,9 @@ enum { * MG: pool->manager_mutex and pool->lock protected. Writes require both * locks. Reads can happen under either lock. * - * WQ: wq_mutex protected. + * PL: wq_pool_mutex protected. * - * WR: wq_mutex protected for writes. Sched-RCU protected for reads. + * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. * * PW: pwq_lock protected. * @@ -163,8 +163,8 @@ struct worker_pool { struct idr worker_idr; /* MG: worker IDs and iteration */ struct workqueue_attrs *attrs; /* I: worker attributes */ - struct hlist_node hash_node; /* WQ: unbound_pool_hash node */ - int refcnt; /* WQ: refcnt for unbound pools */ + struct hlist_node hash_node; /* PL: unbound_pool_hash node */ + int refcnt; /* PL: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -226,10 +226,10 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* WQ: WQ_* flags */ + unsigned int flags; /* PL: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* FR: all pwqs of this wq */ - struct list_head list; /* WQ: list of all workqueues */ + struct list_head list; /* PL: list of all workqueues */ struct mutex flush_mutex; /* protects wq flushing */ int work_color; /* F: current work color */ @@ -242,7 +242,7 @@ struct workqueue_struct { struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* WQ: drain in progress */ + int nr_drainers; /* PL: drain in progress */ int saved_max_active; /* PW: saved pwq max_active */ #ifdef CONFIG_SYSFS @@ -256,20 +256,20 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; -static DEFINE_MUTEX(wq_mutex); /* protects workqueues and pools */ +static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static LIST_HEAD(workqueues); /* WQ: list of all workqueues */ -static bool workqueue_freezing; /* WQ: have wqs started freezing? */ +static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); -static DEFINE_IDR(worker_pool_idr); /* WR: idr of all pools */ +static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ -/* WQ: hash of all unbound pools keyed by pool->attrs */ +/* PL: hash of all unbound pools keyed by pool->attrs */ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); /* I: attributes used when instantiating standard unbound pools on demand */ @@ -293,10 +293,10 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> -#define assert_rcu_or_wq_mutex() \ +#define assert_rcu_or_pool_mutex() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq_mutex), \ - "sched RCU or wq_mutex should be held") + lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_pwq_lock() \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ @@ -323,16 +323,16 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pool: iteration cursor * @pi: integer used for iteration * - * This must be called either with wq_mutex held or sched RCU read locked. - * If the pool needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pool stays online. + * This must be called either with wq_pool_mutex held or sched RCU read + * locked. If the pool needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pool stays online. * * The if/else clause exists only for the lockdep assertion and can be * ignored. */ #define for_each_pool(pool, pi) \ idr_for_each_entry(&worker_pool_idr, pool, pi) \ - if (({ assert_rcu_or_wq_mutex(); false; })) { } \ + if (({ assert_rcu_or_pool_mutex(); false; })) { } \ else /** @@ -489,7 +489,7 @@ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; - lockdep_assert_held(&wq_mutex); + lockdep_assert_held(&wq_pool_mutex); do { if (!idr_pre_get(&worker_pool_idr, GFP_KERNEL)) @@ -607,9 +607,9 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * * Return the worker_pool @work was last associated with. %NULL if none. * - * Pools are created and destroyed under wq_mutex, and allows read access - * under sched-RCU read lock. As such, this function should be called - * under wq_mutex or with preemption disabled. + * Pools are created and destroyed under wq_pool_mutex, and allows read + * access under sched-RCU read lock. As such, this function should be + * called under wq_pool_mutex or with preemption disabled. * * All fields of the returned pool are accessible as long as the above * mentioned locking is in effect. If the returned pool needs to be used @@ -621,7 +621,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) unsigned long data = atomic_long_read(&work->data); int pool_id; - assert_rcu_or_wq_mutex(); + assert_rcu_or_pool_mutex(); if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) @@ -2684,10 +2684,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); reflush: flush_workqueue(wq); @@ -2714,10 +2714,10 @@ reflush: local_irq_enable(); - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -3430,16 +3430,16 @@ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (--pool->refcnt) { - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return; } /* sanity checks */ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || WARN_ON(!list_empty(&pool->worklist))) { - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return; } @@ -3448,7 +3448,7 @@ static void put_unbound_pool(struct worker_pool *pool) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); /* * Become the manager and destroy all workers. Grabbing @@ -3489,7 +3489,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { @@ -3520,10 +3520,10 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); out_unlock: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return pool; fail: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); if (pool) put_unbound_pool(pool); return NULL; @@ -3803,10 +3803,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_destroy; /* - * wq_mutex protects global freeze state and workqueues list. Grab - * it, adjust max_active and add the new @wq to workqueues list. + * wq_pool_mutex protects global freeze state and workqueues list. + * Grab it, adjust max_active and add the new @wq to workqueues + * list. */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); spin_lock_irq(&pwq_lock); for_each_pwq(pwq, wq) @@ -3815,7 +3816,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, list_add(&wq->list, &workqueues); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return wq; @@ -3866,9 +3867,9 @@ void destroy_workqueue(struct workqueue_struct *wq) * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); list_del_init(&wq->list); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); workqueue_sysfs_unregister(wq); @@ -4198,7 +4199,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, case CPU_DOWN_FAILED: case CPU_ONLINE: - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); for_each_pool(pool, pi) { mutex_lock(&pool->manager_mutex); @@ -4216,7 +4217,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_unlock(&pool->manager_mutex); } - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); break; } return NOTIFY_OK; @@ -4292,7 +4293,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases wq_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4301,7 +4302,7 @@ void freeze_workqueues_begin(void) struct pool_workqueue *pwq; int pi; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; @@ -4322,7 +4323,7 @@ void freeze_workqueues_begin(void) } spin_unlock_irq(&pwq_lock); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } /** @@ -4332,7 +4333,7 @@ void freeze_workqueues_begin(void) * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: - * Grabs and releases wq_mutex. + * Grabs and releases wq_pool_mutex. * * RETURNS: * %true if some freezable workqueues are still busy. %false if freezing @@ -4344,7 +4345,7 @@ bool freeze_workqueues_busy(void) struct workqueue_struct *wq; struct pool_workqueue *pwq; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); WARN_ON_ONCE(!workqueue_freezing); @@ -4367,7 +4368,7 @@ bool freeze_workqueues_busy(void) rcu_read_unlock_sched(); } out_unlock: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); return busy; } @@ -4378,7 +4379,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases wq_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. */ void thaw_workqueues(void) { @@ -4387,7 +4388,7 @@ void thaw_workqueues(void) struct worker_pool *pool; int pi; - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; @@ -4410,7 +4411,7 @@ void thaw_workqueues(void) workqueue_freezing = false; out_unlock: - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } #endif /* CONFIG_FREEZER */ @@ -4442,9 +4443,9 @@ static int __init init_workqueues(void) pool->attrs->nice = std_nice[i++]; /* alloc pool ID */ - mutex_lock(&wq_mutex); + mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); - mutex_unlock(&wq_mutex); + mutex_unlock(&wq_pool_mutex); } } -- cgit v1.2.3 From 3c25a55daadc7e7058926f5728fba7721d824ffb Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Mon, 25 Mar 2013 16:57:17 -0700 Subject: workqueue: rename wq->flush_mutex to wq->mutex Currently pwq->flush_mutex protects many fields of a workqueue including, especially, the pwqs list. We're going to expand this mutex to protect most of a workqueue and eventually replace pwq_lock, which will make locking simpler and easier to understand. Drop the "flush_" prefix in preparation. This patch is pure rename. tj: Rebased on top of the current dev branch. Updated description. Use WQ: and WR: instead of Q: and QR: for synchronization labels. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 064157eac4c8..d448edae3513 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -118,8 +118,6 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * F: wq->flush_mutex protected. - * * MG: pool->manager_mutex and pool->lock protected. Writes require both * locks. Reads can happen under either lock. * @@ -129,8 +127,10 @@ enum { * * PW: pwq_lock protected. * - * FR: wq->flush_mutex and pwq_lock protected for writes. Sched-RCU - * protected for reads. + * WQ: wq->mutex protected. + * + * WR: wq->mutex and pwq_lock protected for writes. Sched-RCU protected + * for reads. * * MD: wq_mayday_lock protected. */ @@ -197,7 +197,7 @@ struct pool_workqueue { int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ - struct list_head pwqs_node; /* FR: node on wq->pwqs */ + struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ /* @@ -214,8 +214,8 @@ struct pool_workqueue { * Structure used to wait for workqueue flush. */ struct wq_flusher { - struct list_head list; /* F: list of flushers */ - int flush_color; /* F: flush color waiting for */ + struct list_head list; /* WQ: list of flushers */ + int flush_color; /* WQ: flush color waiting for */ struct completion done; /* flush completion */ }; @@ -228,16 +228,16 @@ struct wq_device; struct workqueue_struct { unsigned int flags; /* PL: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ - struct list_head pwqs; /* FR: all pwqs of this wq */ + struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PL: list of all workqueues */ - struct mutex flush_mutex; /* protects wq flushing */ - int work_color; /* F: current work color */ - int flush_color; /* F: current flush color */ + struct mutex mutex; /* protects this wq */ + int work_color; /* WQ: current work color */ + int flush_color; /* WQ: current flush color */ atomic_t nr_pwqs_to_flush; /* flush in progress */ - struct wq_flusher *first_flusher; /* F: first flusher */ - struct list_head flusher_queue; /* F: flush waiters */ - struct list_head flusher_overflow; /* F: flush overflow list */ + struct wq_flusher *first_flusher; /* WQ: first flusher */ + struct list_head flusher_queue; /* WQ: flush waiters */ + struct list_head flusher_overflow; /* WQ: flush overflow list */ struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ @@ -2460,7 +2460,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * advanced to @work_color. * * CONTEXT: - * mutex_lock(wq->flush_mutex). + * mutex_lock(wq->mutex). * * RETURNS: * %true if @flush_color >= 0 and there's something to flush. %false @@ -2529,7 +2529,7 @@ void flush_workqueue(struct workqueue_struct *wq) lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* * Start-to-wait phase @@ -2574,7 +2574,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2587,7 +2587,7 @@ void flush_workqueue(struct workqueue_struct *wq) if (wq->first_flusher != &this_flusher) return; - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) @@ -2659,7 +2659,7 @@ void flush_workqueue(struct workqueue_struct *wq) } out_unlock: - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -3550,15 +3550,15 @@ static void pwq_unbound_release_workfn(struct work_struct *work) return; /* - * Unlink @pwq. Synchronization against flush_mutex isn't strictly + * Unlink @pwq. Synchronization against wq->mutex isn't strictly * necessary on release but do it anyway. It's easier to verify * and consistent with the linking path. */ - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); spin_lock_irq(&pwq_lock); list_del_rcu(&pwq->pwqs_node); spin_unlock_irq(&pwq_lock); - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); put_unbound_pool(pool); call_rcu_sched(&pwq->rcu, rcu_free_pwq); @@ -3627,12 +3627,12 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); spin_lock_irq(&pwq_lock); /* * Set the matching work_color. This is synchronized with - * flush_mutex to avoid confusing flush_workqueue(). + * wq->mutex to avoid confusing flush_workqueue(). */ if (p_last_pwq) *p_last_pwq = first_pwq(wq); @@ -3645,7 +3645,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&pwq_lock); - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); } /** @@ -3762,7 +3762,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; - mutex_init(&wq->flush_mutex); + mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); -- cgit v1.2.3 From 87fc741e94cf64445c698486982b30afa0811eca Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Mon, 25 Mar 2013 16:57:18 -0700 Subject: workqueue: protect wq->nr_drainers and ->flags with wq->mutex We're expanding wq->mutex to cover all fields specific to each workqueue with the end goal of replacing pwq_lock which will make locking simpler and easier to understand. wq->nr_drainers and ->flags are specific to each workqueue. Protect ->nr_drainers and ->flags with wq->mutex instead of pool_mutex. tj: Rebased on top of the current dev branch. Updated description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d448edae3513..3ac2c4d85607 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -226,7 +226,7 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* PL: WQ_* flags */ + unsigned int flags; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PL: list of all workqueues */ @@ -242,7 +242,7 @@ struct workqueue_struct { struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* PL: drain in progress */ + int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* PW: saved pwq max_active */ #ifdef CONFIG_SYSFS @@ -2684,10 +2684,10 @@ void drain_workqueue(struct workqueue_struct *wq) * hotter than drain_workqueue() and already looks at @wq->flags. * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - mutex_lock(&wq_pool_mutex); + mutex_lock(&wq->mutex); if (!wq->nr_drainers++) wq->flags |= __WQ_DRAINING; - mutex_unlock(&wq_pool_mutex); + mutex_unlock(&wq->mutex); reflush: flush_workqueue(wq); @@ -2714,10 +2714,10 @@ reflush: local_irq_enable(); - mutex_lock(&wq_pool_mutex); + mutex_lock(&wq->mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; - mutex_unlock(&wq_pool_mutex); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); -- cgit v1.2.3 From b09f4fd39c0e562aff3682773f4c451d6125048c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Mon, 25 Mar 2013 16:57:18 -0700 Subject: workqueue: protect wq->pwqs and iteration with wq->mutex We're expanding wq->mutex to cover all fields specific to each workqueue with the end goal of replacing pwq_lock which will make locking simpler and easier to understand. init_and_link_pwq() and pwq_unbound_release_workfn() already grab wq->mutex when adding or removing a pwq from wq->pwqs list. This patch makes it official that the list is wq->mutex protected for writes and updates readers accoridingly. Explicit IRQ toggles for sched-RCU read-locking in flush_workqueue_prep_pwqs() and drain_workqueues() are removed as the surrounding wq->mutex can provide sufficient synchronization. Also, assert_rcu_or_pwq_lock() is renamed to assert_rcu_or_wq_mutex() and checks for wq->mutex too. pwq_lock locking and assertion are not removed by this patch and a couple of for_each_pwq() iterations are still protected by it. They'll be removed by future patches. tj: Rebased on top of the current dev branch. Updated description. Folded in assert_rcu_or_wq_mutex() renaming from a later patch along with associated comment updates. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 42 ++++++++++++++++++------------------------ 1 file changed, 18 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3ac2c4d85607..9c32fd171d5c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -204,7 +204,7 @@ struct pool_workqueue { * Release of unbound pwq is punted to system_wq. See put_pwq() * and pwq_unbound_release_workfn() for details. pool_workqueue * itself is also sched-RCU protected so that the first pwq can be - * determined without grabbing pwq_lock. + * determined without grabbing wq->mutex. */ struct work_struct unbound_release_work; struct rcu_head rcu; @@ -298,10 +298,11 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, lockdep_is_held(&wq_pool_mutex), \ "sched RCU or wq_pool_mutex should be held") -#define assert_rcu_or_pwq_lock() \ +#define assert_rcu_or_wq_mutex(wq) \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq->mutex) || \ lockdep_is_held(&pwq_lock), \ - "sched RCU or pwq_lock should be held") + "sched RCU or wq->mutex should be held") #ifdef CONFIG_LOCKDEP #define assert_manager_or_pool_lock(pool) \ @@ -356,7 +357,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, * @pwq: iteration cursor * @wq: the target workqueue * - * This must be called either with pwq_lock held or sched RCU read locked. + * This must be called either with wq->mutex held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. * @@ -365,7 +366,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, */ #define for_each_pwq(pwq, wq) \ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ - if (({ assert_rcu_or_pwq_lock(); false; })) { } \ + if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -504,13 +505,13 @@ static int worker_pool_assign_id(struct worker_pool *pool) * first_pwq - return the first pool_workqueue of the specified workqueue * @wq: the target workqueue * - * This must be called either with pwq_lock held or sched RCU read locked. + * This must be called either with wq->mutex held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. */ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) { - assert_rcu_or_pwq_lock(); + assert_rcu_or_wq_mutex(wq); return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, pwqs_node); } @@ -2477,12 +2478,10 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, atomic_set(&wq->nr_pwqs_to_flush, 1); } - local_irq_disable(); - for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; - spin_lock(&pool->lock); + spin_lock_irq(&pool->lock); if (flush_color >= 0) { WARN_ON_ONCE(pwq->flush_color != -1); @@ -2499,11 +2498,9 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, pwq->work_color = work_color; } - spin_unlock(&pool->lock); + spin_unlock_irq(&pool->lock); } - local_irq_enable(); - if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) complete(&wq->first_flusher->done); @@ -2691,14 +2688,14 @@ void drain_workqueue(struct workqueue_struct *wq) reflush: flush_workqueue(wq); - local_irq_disable(); + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { bool drained; - spin_lock(&pwq->pool->lock); + spin_lock_irq(&pwq->pool->lock); drained = !pwq->nr_active && list_empty(&pwq->delayed_works); - spin_unlock(&pwq->pool->lock); + spin_unlock_irq(&pwq->pool->lock); if (drained) continue; @@ -2708,13 +2705,10 @@ reflush: pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", wq->name, flush_cnt); - local_irq_enable(); + mutex_unlock(&wq->mutex); goto reflush; } - local_irq_enable(); - - mutex_lock(&wq->mutex); if (!--wq->nr_drainers) wq->flags &= ~__WQ_DRAINING; mutex_unlock(&wq->mutex); @@ -3843,13 +3837,13 @@ void destroy_workqueue(struct workqueue_struct *wq) drain_workqueue(wq); /* sanity checks */ - spin_lock_irq(&pwq_lock); + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) { int i; for (i = 0; i < WORK_NR_COLORS; i++) { if (WARN_ON(pwq->nr_in_flight[i])) { - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); return; } } @@ -3857,11 +3851,11 @@ void destroy_workqueue(struct workqueue_struct *wq) if (WARN_ON(pwq->refcnt > 1) || WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); return; } } - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); /* * wq list is used to freeze wq, remove from list after -- cgit v1.2.3 From a357fc03262988f2aa6c4a668b89be22b11ff1e7 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Mon, 25 Mar 2013 16:57:19 -0700 Subject: workqueue: protect wq->saved_max_active with wq->mutex We're expanding wq->mutex to cover all fields specific to each workqueue with the end goal of replacing pwq_lock which will make locking simpler and easier to understand. This patch makes wq->saved_max_active protected by wq->mutex instead of pwq_lock. As pwq_lock locking around pwq_adjust_mac_active() is no longer necessary, this patch also replaces pwq_lock lockings of for_each_pwq() around pwq_adjust_max_active() to wq->mutex. tj: Rebased on top of the current dev branch. Updated description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9c32fd171d5c..af6087a5a10a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -243,7 +243,7 @@ struct workqueue_struct { struct worker *rescuer; /* I: rescue worker */ int nr_drainers; /* WQ: drain in progress */ - int saved_max_active; /* PW: saved pwq max_active */ + int saved_max_active; /* WQ: saved pwq max_active */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -3579,13 +3579,13 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) bool freezable = wq->flags & WQ_FREEZABLE; /* for @wq->saved_max_active */ - lockdep_assert_held(&pwq_lock); + lockdep_assert_held(&wq->mutex); /* fast exit for non-freezable wqs */ if (!freezable && pwq->max_active == wq->saved_max_active) return; - spin_lock(&pwq->pool->lock); + spin_lock_irq(&pwq->pool->lock); if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { pwq->max_active = wq->saved_max_active; @@ -3603,7 +3603,7 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) pwq->max_active = 0; } - spin_unlock(&pwq->pool->lock); + spin_unlock_irq(&pwq->pool->lock); } static void init_and_link_pwq(struct pool_workqueue *pwq, @@ -3622,7 +3622,6 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); mutex_lock(&wq->mutex); - spin_lock_irq(&pwq_lock); /* * Set the matching work_color. This is synchronized with @@ -3636,9 +3635,10 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq_adjust_max_active(pwq); /* link in @pwq */ + spin_lock_irq(&pwq_lock); list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); } @@ -3803,10 +3803,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, */ mutex_lock(&wq_pool_mutex); - spin_lock_irq(&pwq_lock); + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); list_add(&wq->list, &workqueues); @@ -3917,14 +3917,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock_irq(&pwq_lock); + mutex_lock(&wq->mutex); wq->saved_max_active = max_active; for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); - spin_unlock_irq(&pwq_lock); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(workqueue_set_max_active); @@ -4287,7 +4287,7 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * pool->worklist. * * CONTEXT: - * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void freeze_workqueues_begin(void) { @@ -4309,13 +4309,12 @@ void freeze_workqueues_begin(void) spin_unlock_irq(&pool->lock); } - /* suppress further executions by setting max_active to zero */ - spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); } - spin_unlock_irq(&pwq_lock); mutex_unlock(&wq_pool_mutex); } @@ -4373,7 +4372,7 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases wq_pool_mutex, pwq_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void thaw_workqueues(void) { @@ -4396,12 +4395,12 @@ void thaw_workqueues(void) } /* restore max_active and repopulate worklist */ - spin_lock_irq(&pwq_lock); list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); for_each_pwq(pwq, wq) pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); } - spin_unlock_irq(&pwq_lock); workqueue_freezing = false; out_unlock: -- cgit v1.2.3 From b5927605478b740d73192f587e458de1632106e8 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Mon, 25 Mar 2013 16:57:19 -0700 Subject: workqueue: remove pwq_lock which is no longer used To simplify locking, the previous patches expanded wq->mutex to protect all fields of each workqueue instance including the pwqs list leaving pwq_lock without any user. Remove the unused pwq_lock. tj: Rebased on top of the current dev branch. Updated description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index af6087a5a10a..04a8b98d30ce 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -125,12 +125,9 @@ enum { * * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. * - * PW: pwq_lock protected. - * * WQ: wq->mutex protected. * - * WR: wq->mutex and pwq_lock protected for writes. Sched-RCU protected - * for reads. + * WR: wq->mutex protected for writes. Sched-RCU protected for reads. * * MD: wq_mayday_lock protected. */ @@ -257,7 +254,6 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ -static DEFINE_SPINLOCK(pwq_lock); /* protects pool_workqueues */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PL: list of all workqueues */ @@ -300,8 +296,7 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, #define assert_rcu_or_wq_mutex(wq) \ rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex) || \ - lockdep_is_held(&pwq_lock), \ + lockdep_is_held(&wq->mutex), \ "sched RCU or wq->mutex should be held") #ifdef CONFIG_LOCKDEP @@ -3549,9 +3544,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * and consistent with the linking path. */ mutex_lock(&wq->mutex); - spin_lock_irq(&pwq_lock); list_del_rcu(&pwq->pwqs_node); - spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->mutex); put_unbound_pool(pool); @@ -3635,9 +3628,7 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, pwq_adjust_max_active(pwq); /* link in @pwq */ - spin_lock_irq(&pwq_lock); list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - spin_unlock_irq(&pwq_lock); mutex_unlock(&wq->mutex); } -- cgit v1.2.3 From 751c644b95bb48aaa8825f0c66abbcc184d92051 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Tue, 26 Mar 2013 02:27:11 -0700 Subject: pid: Handle the exit of a multi-threaded init. When a multi-threaded init exits and the initial thread is not the last thread to exit the initial thread hangs around as a zombie until the last thread exits. In that case zap_pid_ns_processes needs to wait until there are only 2 hashed pids in the pid namespace not one. v2. Replace thread_pid_vnr(me) == 1 with the test thread_group_leader(me) as suggested by Oleg. Cc: stable@vger.kernel.org Cc: Oleg Nesterov <oleg@redhat.com> Reported-by: Caj Larsson <caj@omnicloud.com> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> --- kernel/pid_namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c6023..bea15bdf82b0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int nr; int rc; struct task_struct *task, *me = current; + int init_pids = thread_group_leader(me) ? 1 : 2; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (pid_ns->nr_hashed == 1) + if (pid_ns->nr_hashed == init_pids) break; schedule(); } -- cgit v1.2.3 From 911af505ef407c2511106c224dd640f882f0f590 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Mon, 11 Feb 2013 10:23:27 -0800 Subject: rcu: Provide compile-time control for no-CBs CPUs Currently, the only way to specify no-CBs CPUs is via the rcu_nocbs kernel command-line parameter. This is inconvenient in some cases, particularly for randconfig testing, so this commit adds a new set of kernel configuration parameters. CONFIG_RCU_NOCB_CPU_NONE (the default) retains the old behavior, CONFIG_RCU_NOCB_CPU_ZERO offloads callback processing from CPU 0 (along with any other CPUs specified by the rcu_nocbs boot-time parameter), and CONFIG_RCU_NOCB_CPU_ALL offloads callback processing from all CPUs. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- init/Kconfig | 39 +++++++++++++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 14 ++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index c8bd349eb638..307499704580 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -676,6 +676,45 @@ config RCU_NOCB_CPU Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. +choice + prompt "Build-forced no-CBs CPUs" + default RCU_NOCB_CPU_NONE + help + This option allows no-CBs CPUs to be specified at build time. + Additional no-CBs CPUs may be specified by the rcu_nocbs= + boot parameter. + +config RCU_NOCB_CPU_NONE + bool "No build_forced no-CBs CPUs" + depends on RCU_NOCB_CPU + help + This option does not force any of the CPUs to be no-CBs CPUs. + Only CPUs designated by the rcu_nocbs= boot parameter will be + no-CBs CPUs. + +config RCU_NOCB_CPU_ZERO + bool "CPU 0 is a build_forced no-CBs CPU" + depends on RCU_NOCB_CPU + help + This option forces CPU 0 to be a no-CBs CPU. Additional CPUs + may be designated as no-CBs CPUs using the rcu_nocbs= boot + parameter will be no-CBs CPUs. + + Select this if CPU 0 needs to be a no-CBs CPU for real-time + or energy-efficiency reasons. + +config RCU_NOCB_CPU_ALL + bool "All CPUs are build_forced no-CBs CPUs" + depends on RCU_NOCB_CPU + help + This option forces all CPUs to be no-CBs CPUs. The rcu_nocbs= + boot parameter will be ignored. + + Select this if all CPUs need to be no-CBs CPUs for real-time + or energy-efficiency reasons. + +endchoice + endmenu # "RCU Subsystem" config IKCONFIG diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 44f958a88b21..3e33aefce0ea 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -85,6 +85,20 @@ static void __init rcu_bootup_announce_oddness(void) if (nr_cpu_ids != NR_CPUS) printk(KERN_INFO "\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); #ifdef CONFIG_RCU_NOCB_CPU +#ifndef CONFIG_RCU_NOCB_CPU_NONE + if (!have_rcu_nocb_mask) { + alloc_bootmem_cpumask_var(&rcu_nocb_mask); + have_rcu_nocb_mask = true; + } +#ifdef CONFIG_RCU_NOCB_CPU_ZERO + pr_info("\tExperimental no-CBs CPU 0\n"); + cpumask_set_cpu(0, rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ +#ifdef CONFIG_RCU_NOCB_CPU_ALL + pr_info("\tExperimental no-CBs for all CPUs\n"); + cpumask_setall(rcu_nocb_mask); +#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ +#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ if (have_rcu_nocb_mask) { cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); pr_info("\tExperimental no-CBs CPUs: %s.\n", nocb_buf); -- cgit v1.2.3 From dae6e64d2bcfd4b06304ab864c7e3a4f6b5fedf4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Sun, 10 Feb 2013 20:48:58 -0800 Subject: rcu: Introduce proper blocking to no-CBs kthreads GP waits Currently, the no-CBs kthreads do repeated timed waits for grace periods to elapse. This is crude and energy inefficient, so this commit allows no-CBs kthreads to specify exactly which grace period they are waiting for and also allows them to block for the entire duration until the desired grace period completes. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 6 ++- kernel/rcutree.h | 12 ++++- kernel/rcutree_plugin.h | 129 +++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 127 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6ad0716e65dc..433f426c848f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -310,7 +310,7 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) if (rcu_gp_in_progress(rsp)) return 0; /* No, a grace period is already in progress. */ - if (rcu_nocb_needs_gp(rdp)) + if (rcu_nocb_needs_gp(rsp)) return 1; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) return 0; /* No, this is a no-CBs (or offline) CPU. */ @@ -1364,6 +1364,7 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) static void rcu_gp_cleanup(struct rcu_state *rsp) { unsigned long gp_duration; + int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1394,11 +1395,13 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); rnp->completed = rsp->gpnum; + nocb += rcu_nocb_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } rnp = rcu_get_root(rsp); raw_spin_lock_irq(&rnp->lock); + rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ trace_rcu_grace_period(rsp->name, rsp->completed, "end"); @@ -3084,6 +3087,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); + rcu_init_one_nocb(rnp); } } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7af39f4aaac4..e51373c0b748 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -196,6 +196,12 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_NOCB_CPU + wait_queue_head_t nocb_gp_wq[2]; + /* Place for rcu_nocb_kthread() to wait GP. */ + int n_nocb_gp_requests[2]; + /* Counts of upcoming no-CB GP requests. */ +#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; @@ -326,7 +332,6 @@ struct rcu_data { int nocb_p_count_lazy; /* (approximate). */ wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; - bool nocb_needs_gp; #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int cpu; @@ -524,7 +529,10 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); -static int rcu_nocb_needs_gp(struct rcu_data *rdp); +static int rcu_nocb_needs_gp(struct rcu_state *rsp); +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); +static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_init_one_nocb(struct rcu_node *rnp); static bool is_nocb_cpu(int cpu); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3e33aefce0ea..90a191452550 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2176,11 +2176,51 @@ static int __init parse_rcu_nocb_poll(char *arg) early_param("rcu_nocb_poll", parse_rcu_nocb_poll); /* - * Does this CPU needs a grace period due to offloaded callbacks? + * Do any no-CBs CPUs need another grace period? + * + * Interrupts must be disabled. If the caller does not hold the root + * rnp_node structure's ->lock, the results are advisory only. + */ +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + return rnp->n_nocb_gp_requests[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; +} + +/* + * Clean up this rcu_node structure's no-CBs state at the end of + * a grace period, and also return whether any no-CBs CPU associated + * with this rcu_node structure needs another grace period. + */ +static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int c = rnp->completed; + int needmore; + + wake_up_all(&rnp->nocb_gp_wq[c & 0x1]); + rnp->n_nocb_gp_requests[c & 0x1] = 0; + needmore = rnp->n_nocb_gp_requests[(c + 1) & 0x1]; + return needmore; +} + +/* + * Set the root rcu_node structure's ->n_nocb_gp_requests field + * based on the sum of those of all rcu_node structures. This does + * double-count the root rcu_node structure's requests, but this + * is necessary to handle the possibility of a rcu_nocb_kthread() + * having awakened during the time that the rcu_node structures + * were being updated for the end of the previous grace period. */ -static int rcu_nocb_needs_gp(struct rcu_data *rdp) +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ + rnp->n_nocb_gp_requests[(rnp->completed + 1) & 0x1] += nrq; +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) { - return rdp->nocb_needs_gp; + init_waitqueue_head(&rnp->nocb_gp_wq[0]); + init_waitqueue_head(&rnp->nocb_gp_wq[1]); } /* Is the specified CPU a no-CPUs CPU? */ @@ -2289,31 +2329,73 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, static void rcu_nocb_wait_gp(struct rcu_data *rdp) { unsigned long c; + bool d; unsigned long flags; - unsigned long j; + unsigned long flags1; struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); raw_spin_lock_irqsave(&rnp->lock, flags); c = rnp->completed + 2; - rdp->nocb_needs_gp = true; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* Count our request for a grace period. */ + rnp->n_nocb_gp_requests[c & 0x1]++; + + if (rnp->gpnum != rnp->completed) { + + /* + * This rcu_node structure believes that a grace period + * is in progress, so we are done. When this grace + * period ends, our request will be acted upon. + */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + } else { + + /* + * Might not be a grace period, check root rcu_node + * structure to see if we must start one. + */ + if (rnp != rnp_root) + raw_spin_lock(&rnp_root->lock); /* irqs disabled. */ + if (rnp_root->gpnum != rnp_root->completed) { + raw_spin_unlock(&rnp_root->lock); /* irqs disabled. */ + } else { + + /* + * No grace period, so we need to start one. + * The good news is that we can wait for exactly + * one grace period instead of part of the current + * grace period and all of the next grace period. + * Adjust counters accordingly and start the + * needed grace period. + */ + rnp->n_nocb_gp_requests[c & 0x1]--; + c = rnp_root->completed + 1; + rnp->n_nocb_gp_requests[c & 0x1]++; + rnp_root->n_nocb_gp_requests[c & 0x1]++; + local_save_flags(flags1); + rcu_start_gp(rdp->rsp, flags1); /* Rlses ->lock. */ + } + + /* Clean up locking and irq state. */ + if (rnp != rnp_root) + raw_spin_unlock_irqrestore(&rnp->lock, flags); + else + local_irq_restore(flags); + } /* * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ for (;;) { - j = jiffies; - schedule_timeout_interruptible(2); - raw_spin_lock_irqsave(&rnp->lock, flags); - if (ULONG_CMP_GE(rnp->completed, c)) { - rdp->nocb_needs_gp = false; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + wait_event_interruptible( + rnp->nocb_gp_wq[c & 0x1], + (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); + if (likely(d)) break; - } - if (j == jiffies) - flush_signals(current); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + flush_signals(current); } smp_mb(); /* Ensure that CB invocation happens after GP end. */ } @@ -2416,11 +2498,24 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ -static int rcu_nocb_needs_gp(struct rcu_data *rdp) +static int rcu_nocb_needs_gp(struct rcu_state *rsp) +{ + return 0; +} + +static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { return 0; } +static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) +{ +} + +static void rcu_init_one_nocb(struct rcu_node *rnp) +{ +} + static bool is_nocb_cpu(int cpu) { return false; -- cgit v1.2.3 From 21e7a6087480451804124cee27c0a7d0a7de1564 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Sat, 9 Feb 2013 17:42:16 -0800 Subject: rcu: Add event tracing for no-CBs CPUs' callback registration Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree_plugin.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 90a191452550..7225a5a14cef 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2285,6 +2285,13 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, if (!is_nocb_cpu(rdp->cpu)) return 0; __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); + if (__is_kfree_rcu_offset((unsigned long)rhp->func)) + trace_rcu_kfree_callback(rdp->rsp->name, rhp, + (unsigned long)rhp->func, + rdp->qlen_lazy, rdp->qlen); + else + trace_rcu_callback(rdp->rsp->name, rhp, + rdp->qlen_lazy, rdp->qlen); return 1; } -- cgit v1.2.3 From 09c7b890622d72b5e004cc249bbe610e8b928ddf Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Fri, 8 Feb 2013 15:55:02 -0800 Subject: rcu: Add event tracing for no-CBs CPUs' grace periods Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- include/trace/events/rcu.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 30 +++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 1918e832da4f..cdfed6d386eb 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -71,6 +71,58 @@ TRACE_EVENT(rcu_grace_period, __entry->rcuname, __entry->gpnum, __entry->gpevent) ); +/* + * Tracepoint for no-callbacks grace-period events. The caller should + * pull the data from the rcu_node structure, other than rcuname, which + * comes from the rcu_state structure, and event, which is one of the + * following: + * + * "Startleaf": Request a nocb grace period based on leaf-node data. + * "Startedleaf": Leaf-node start proved sufficient. + * "Startedleafroot": Leaf-node start proved sufficient after checking root. + * "Startedroot": Requested a nocb grace period based on root-node data. + * "StartWait": Start waiting for the requested grace period. + * "ResumeWait": Resume waiting after signal. + * "EndWait": Complete wait. + * "Cleanup": Clean up rcu_node structure after previous GP. + * "CleanupMore": Clean up, and another no-CB GP is needed. + */ +TRACE_EVENT(rcu_nocb_grace_period, + + TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed, + unsigned long c, u8 level, int grplo, int grphi, + char *gpevent), + + TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent), + + TP_STRUCT__entry( + __field(char *, rcuname) + __field(unsigned long, gpnum) + __field(unsigned long, completed) + __field(unsigned long, c) + __field(u8, level) + __field(int, grplo) + __field(int, grphi) + __field(char *, gpevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->gpnum = gpnum; + __entry->completed = completed; + __entry->c = c; + __entry->level = level; + __entry->grplo = grplo; + __entry->grphi = grphi; + __entry->gpevent = gpevent; + ), + + TP_printk("%s %lu %lu %lu %u %d %d %s", + __entry->rcuname, __entry->gpnum, __entry->completed, + __entry->c, __entry->level, __entry->grplo, __entry->grphi, + __entry->gpevent) +); + /* * Tracepoint for grace-period-initialization events. These are * distinguished by the type of RCU, the new grace-period number, the @@ -601,6 +653,9 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ qsmask) do { } while (0) +#define trace_rcu_nocb_grace_period(rcuname, gpnum, completed, c, \ + level, grplo, grphi, event) \ + do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7225a5a14cef..e32236e83dda 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2201,6 +2201,9 @@ static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) wake_up_all(&rnp->nocb_gp_wq[c & 0x1]); rnp->n_nocb_gp_requests[c & 0x1] = 0; needmore = rnp->n_nocb_gp_requests[(c + 1) & 0x1]; + trace_rcu_nocb_grace_period(rsp->name, rnp->gpnum, rnp->completed, + c, rnp->level, rnp->grplo, rnp->grphi, + needmore ? "CleanupMore" : "Cleanup"); return needmore; } @@ -2347,6 +2350,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) /* Count our request for a grace period. */ rnp->n_nocb_gp_requests[c & 0x1]++; + trace_rcu_nocb_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, + c, rnp->level, rnp->grplo, rnp->grphi, + "Startleaf"); if (rnp->gpnum != rnp->completed) { @@ -2355,6 +2361,10 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * is in progress, so we are done. When this grace * period ends, our request will be acted upon. */ + trace_rcu_nocb_grace_period(rdp->rsp->name, + rnp->gpnum, rnp->completed, c, + rnp->level, rnp->grplo, rnp->grphi, + "Startedleaf"); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { @@ -2366,6 +2376,11 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (rnp != rnp_root) raw_spin_lock(&rnp_root->lock); /* irqs disabled. */ if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_nocb_grace_period(rdp->rsp->name, + rnp->gpnum, rnp->completed, + c, rnp->level, + rnp->grplo, rnp->grphi, + "Startedleafroot"); raw_spin_unlock(&rnp_root->lock); /* irqs disabled. */ } else { @@ -2381,6 +2396,11 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) c = rnp_root->completed + 1; rnp->n_nocb_gp_requests[c & 0x1]++; rnp_root->n_nocb_gp_requests[c & 0x1]++; + trace_rcu_nocb_grace_period(rdp->rsp->name, + rnp->gpnum, rnp->completed, + c, rnp->level, + rnp->grplo, rnp->grphi, + "Startedroot"); local_save_flags(flags1); rcu_start_gp(rdp->rsp, flags1); /* Rlses ->lock. */ } @@ -2396,6 +2416,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ + trace_rcu_nocb_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, + c, rnp->level, rnp->grplo, rnp->grphi, + "StartWait"); for (;;) { wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2403,7 +2426,14 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; flush_signals(current); + trace_rcu_nocb_grace_period(rdp->rsp->name, + rnp->gpnum, rnp->completed, c, + rnp->level, rnp->grplo, rnp->grphi, + "ResumeWait"); } + trace_rcu_nocb_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, + c, rnp->level, rnp->grplo, rnp->grphi, + "EndWait"); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } -- cgit v1.2.3 From a488985851cf2facd2227bd982cc2c251df56268 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Mon, 3 Dec 2012 08:16:28 -0800 Subject: rcu: Distinguish "rcuo" kthreads by RCU flavor Currently, the per-no-CBs-CPU kthreads are named "rcuo" followed by the CPU number, for example, "rcuo". This is problematic given that there are either two or three RCU flavors, each of which gets a per-CPU kthread with exactly the same name. This commit therefore introduces a one-letter abbreviation for each RCU flavor, namely 'b' for RCU-bh, 'p' for RCU-preempt, and 's' for RCU-sched. This abbreviation is used to distinguish the "rcuo" kthreads, for example, for CPU 0 we would have "rcuob/0", "rcuop/0", and "rcuos/0". Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Tested-by: Dietmar Eggemann <dietmar.eggemann@arm.com> --- Documentation/kernel-parameters.txt | 7 +++++-- init/Kconfig | 13 +++++++------ kernel/rcutree.c | 7 ++++--- kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 5 +++-- 5 files changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..a17ba16c8fc8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2461,9 +2461,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. In kernels built with CONFIG_RCU_NOCB_CPU=y, set the specified list of CPUs to be no-callback CPUs. Invocation of these CPUs' RCU callbacks will - be offloaded to "rcuoN" kthreads created for - that purpose. This reduces OS jitter on the + be offloaded to "rcuox/N" kthreads created for + that purpose, where "x" is "b" for RCU-bh, "p" + for RCU-preempt, and "s" for RCU-sched, and "N" + is the CPU number. This reduces OS jitter on the offloaded CPUs, which can be useful for HPC and + real-time workloads. It can also improve energy efficiency for asymmetric multiprocessors. diff --git a/init/Kconfig b/init/Kconfig index 307499704580..717584064a7e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -666,12 +666,13 @@ config RCU_NOCB_CPU This option offloads callback invocation from the set of CPUs specified at boot time by the rcu_nocbs parameter. - For each such CPU, a kthread ("rcuoN") will be created to - invoke callbacks, where the "N" is the CPU being offloaded. - Nothing prevents this kthread from running on the specified - CPUs, but (1) the kthreads may be preempted between each - callback, and (2) affinity or cgroups can be used to force - the kthreads to run on whatever set of CPUs is desired. + For each such CPU, a kthread ("rcuox/N") will be created to + invoke callbacks, where the "N" is the CPU being offloaded, + and where the "x" is "b" for RCU-bh, "p" for RCU-preempt, and + "s" for RCU-sched. Nothing prevents this kthread from running + on the specified CPUs, but (1) the kthreads may be preempted + between each callback, and (2) affinity or cgroups can be used + to force the kthreads to run on whatever set of CPUs is desired. Say Y here if you want to help to debug reduced OS jitter. Say N here if you are unsure. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 433f426c848f..074cb2d974bf 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -64,7 +64,7 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, cr) { \ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ @@ -76,13 +76,14 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ + .abbr = sabbr, \ } struct rcu_state rcu_sched_state = - RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); + RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e51373c0b748..b6c2335efbdf 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -443,6 +443,7 @@ struct rcu_state { unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ char *name; /* Name of structure. */ + char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ }; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index e32236e83dda..c0164441ab92 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -111,7 +111,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_state rcu_preempt_state = - RCU_STATE_INITIALIZER(rcu_preempt, call_rcu); + RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -2517,7 +2517,8 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) return; for_each_cpu(cpu, rcu_nocb_mask) { rdp = per_cpu_ptr(rsp->rda, cpu); - t = kthread_run(rcu_nocb_kthread, rdp, "rcuo%d", cpu); + t = kthread_run(rcu_nocb_kthread, rdp, + "rcuo%c/%d", rsp->abbr, cpu); BUG_ON(IS_ERR(t)); ACCESS_ONCE(rdp->nocb_kthread) = t; } -- cgit v1.2.3 From 5e44ce35a6ec1a16522fa2099dda27aefd8a584e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Wed, 12 Dec 2012 12:35:29 -0800 Subject: rcu: Export RCU_FAST_NO_HZ parameters to sysfs RCU_FAST_NO_HZ operation is controlled by four compile-time C-preprocessor macros, but some use cases benefit greatly from runtime adjustment, particularly when tuning devices. This commit therefore creates the corresponding sysfs entries. Reported-by: Robin Randhawa <robin.randhawa@arm.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree_plugin.h | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index c0164441ab92..28185ad18df3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1617,6 +1617,15 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ +static int rcu_idle_flushes = RCU_IDLE_FLUSHES; +module_param(rcu_idle_flushes, int, 0644); +static int rcu_idle_opt_flushes = RCU_IDLE_OPT_FLUSHES; +module_param(rcu_idle_opt_flushes, int, 0644); +static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; +module_param(rcu_idle_gp_delay, int, 0644); +static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; +module_param(rcu_idle_lazy_gp_delay, int, 0644); + extern int tick_nohz_enabled; /* @@ -1696,10 +1705,10 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) } /* Set up for the possibility that RCU will post a timer. */ if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - *delta_jiffies = round_up(RCU_IDLE_GP_DELAY + jiffies, - RCU_IDLE_GP_DELAY) - jiffies; + *delta_jiffies = round_up(rcu_idle_gp_delay + jiffies, + rcu_idle_gp_delay) - jiffies; } else { - *delta_jiffies = jiffies + RCU_IDLE_LAZY_GP_DELAY; + *delta_jiffies = jiffies + rcu_idle_lazy_gp_delay; *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; } return 0; @@ -1805,11 +1814,11 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("User dyntick with callbacks"); rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); + round_up(jiffies + rcu_idle_gp_delay, + rcu_idle_gp_delay); } else if (rcu_cpu_has_callbacks(cpu)) { rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); + round_jiffies(jiffies + rcu_idle_lazy_gp_delay); trace_rcu_prep_idle("User dyntick with lazy callbacks"); } else { return; @@ -1861,8 +1870,8 @@ static void rcu_prepare_for_idle(int cpu) /* Check and update the ->dyntick_drain sequencing. */ if (rdtp->dyntick_drain <= 0) { /* First time through, initialize the counter. */ - rdtp->dyntick_drain = RCU_IDLE_FLUSHES; - } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && + rdtp->dyntick_drain = rcu_idle_flushes; + } else if (rdtp->dyntick_drain <= rcu_idle_opt_flushes && !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ @@ -1871,11 +1880,11 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); rdtp->idle_gp_timer_expires = - round_up(jiffies + RCU_IDLE_GP_DELAY, - RCU_IDLE_GP_DELAY); + round_up(jiffies + rcu_idle_gp_delay, + rcu_idle_gp_delay); } else { rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); + round_jiffies(jiffies + rcu_idle_lazy_gp_delay); trace_rcu_prep_idle("Dyntick with lazy callbacks"); } tp = &rdtp->idle_gp_timer; -- cgit v1.2.3 From b11cc5760a9c48c870ad286e8a6d8fdb998fa58d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Mon, 17 Dec 2012 14:21:14 -0800 Subject: rcu: Accelerate RCU callbacks at grace-period end Now that callback acceleration is idempotent, it is safe to accelerate callbacks during grace-period cleanup on any CPUs that the kthread happens to be running on. This commit therefore propagates the completion of the grace period to the per-CPU data structures, and also adds an rcu_advance_cbs() just before the cpu_needs_another_gp() check in order to reduce false-positive grace periods. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 074cb2d974bf..2015bce749f9 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1396,6 +1396,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); rnp->completed = rsp->gpnum; + rdp = this_cpu_ptr(rsp->rda); + if (rnp == rdp->mynode) + __rcu_process_gp_end(rsp, rnp, rdp); nocb += rcu_nocb_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); @@ -1408,6 +1411,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); + rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ if (cpu_needs_another_gp(rsp, rdp)) rsp->gp_flags = 1; raw_spin_unlock_irq(&rnp->lock); @@ -1497,6 +1501,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); + /* + * If there is no grace period in progress right now, any + * callbacks we have up to this point will be satisfied by the + * next grace period. Also, advancing the callbacks reduces the + * probability of false positives from cpu_needs_another_gp() + * resulting in pointless grace periods. So, advance callbacks! + */ + rcu_advance_cbs(rsp, rnp, rdp); + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* @@ -1509,14 +1522,6 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) return; } - /* - * Because there is no grace period in progress right now, - * any callbacks we have up to this point will be satisfied - * by the next grace period. So this is a good place to - * assign a grace period number to recently posted callbacks. - */ - rcu_accelerate_cbs(rsp, rnp, rdp); - rsp->gp_flags = RCU_GP_FLAG_INIT; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ -- cgit v1.2.3 From c0f4dfd4f90f1667d234d21f15153ea09a2eaa66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Fri, 28 Dec 2012 11:30:36 -0800 Subject: rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks Because RCU callbacks are now associated with the number of the grace period that they must wait for, CPUs can now take advance callbacks corresponding to grace periods that ended while a given CPU was in dyntick-idle mode. This eliminates the need to try forcing the RCU state machine while entering idle, thus reducing the CPU intensiveness of RCU_FAST_NO_HZ, which should increase its energy efficiency. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- Documentation/kernel-parameters.txt | 28 ++- include/linux/rcupdate.h | 1 + init/Kconfig | 17 +- kernel/rcutree.c | 28 +-- kernel/rcutree.h | 12 +- kernel/rcutree_plugin.h | 374 ++++++++++-------------------------- kernel/rcutree_trace.c | 2 - 7 files changed, 149 insertions(+), 313 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a17ba16c8fc8..22303b2e74bc 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2490,6 +2490,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. leaf rcu_node structure. Useful for very large systems. + rcutree.jiffies_till_first_fqs= [KNL,BOOT] + Set delay from grace-period initialization to + first attempt to force quiescent states. + Units are jiffies, minimum value is zero, + and maximum value is HZ. + + rcutree.jiffies_till_next_fqs= [KNL,BOOT] + Set delay between subsequent attempts to force + quiescent states. Units are jiffies, minimum + value is one, and maximum value is HZ. + rcutree.qhimark= [KNL,BOOT] Set threshold of queued RCU callbacks over which batch limiting is disabled. @@ -2504,16 +2515,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] Set timeout for RCU CPU stall warning messages. - rcutree.jiffies_till_first_fqs= [KNL,BOOT] - Set delay from grace-period initialization to - first attempt to force quiescent states. - Units are jiffies, minimum value is zero, - and maximum value is HZ. + rcutree.rcu_idle_gp_delay= [KNL,BOOT] + Set wakeup interval for idle CPUs that have + RCU callbacks (RCU_FAST_NO_HZ=y). - rcutree.jiffies_till_next_fqs= [KNL,BOOT] - Set delay between subsequent attempts to force - quiescent states. Units are jiffies, minimum - value is one, and maximum value is HZ. + rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT] + Set wakeup interval for idle CPUs that have + only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y). + Lazy RCU callbacks are those which RCU can + prove do nothing more than free memory. rcutorture.fqs_duration= [KNL,BOOT] Set duration of force_quiescent_state bursts. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b758ce17b309..9ed2c9a4de45 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -80,6 +80,7 @@ extern void do_trace_rcu_torture_read(char *rcutorturename, #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#define ulong2long(a) (*(long *)(&(a))) /* Exported common interfaces */ diff --git a/init/Kconfig b/init/Kconfig index 717584064a7e..a3a2304fa6d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -582,13 +582,16 @@ config RCU_FAST_NO_HZ depends on NO_HZ && SMP default n help - This option causes RCU to attempt to accelerate grace periods in - order to allow CPUs to enter dynticks-idle state more quickly. - On the other hand, this option increases the overhead of the - dynticks-idle checking, thus degrading scheduling latency. - - Say Y if energy efficiency is critically important, and you don't - care about real-time response. + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. Say N if you are unsure. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2015bce749f9..7b1d7769872a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2640,19 +2640,27 @@ static int rcu_pending(int cpu) } /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. + * Return true if the specified CPU has any callback. If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { + bool al = true; + bool hc = false; + struct rcu_data *rdp; struct rcu_state *rsp; - /* RCU callbacks either ready or pending? */ - for_each_rcu_flavor(rsp) - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) - return 1; - return 0; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->qlen != rdp->qlen_lazy) + al = false; + if (rdp->nxtlist) + hc = true; + } + if (all_lazy) + *all_lazy = al; + return hc; } /* @@ -2871,7 +2879,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* Add CPU to rcu_node bitmasks. */ @@ -2945,7 +2952,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, */ for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); - rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b6c2335efbdf..96a27f922e92 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,18 +88,13 @@ struct rcu_dynticks { int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ #ifdef CONFIG_RCU_FAST_NO_HZ - int dyntick_drain; /* Prepare-for-idle state variable. */ - unsigned long dyntick_holdoff; - /* No retries for the jiffy of failure. */ - struct timer_list idle_gp_timer; - /* Wake up CPU sleeping with callbacks. */ - unsigned long idle_gp_timer_expires; - /* When to wake up CPU (for repost). */ - bool idle_first_pass; /* First pass of attempt to go idle? */ + bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + unsigned long last_accelerate; + /* Last jiffy CBs were accelerated. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; @@ -521,7 +516,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ static void __cpuinit rcu_prepare_kthreads(int cpu); -static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 28185ad18df3..d318f9f18be5 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1543,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ + return rcu_cpu_has_callbacks(cpu, NULL); } /* @@ -1587,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void) * * The following three proprocessor symbols control this state machine: * - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt - * to satisfy RCU. Beyond this point, it is better to incur a periodic - * scheduling-clock interrupt than to loop through the state machine - * at full power. - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are - * optional if RCU does not need anything immediately from this - * CPU, even if this CPU still has RCU callbacks queued. The first - * times through the state machine are mandatory: we need to give - * the state machine a chance to communicate a quiescent state - * to the RCU core. * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This * is sized to be roughly one RCU grace period. Those energy-efficiency @@ -1612,15 +1595,9 @@ static void rcu_idle_count_callbacks_posted(void) * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -static int rcu_idle_flushes = RCU_IDLE_FLUSHES; -module_param(rcu_idle_flushes, int, 0644); -static int rcu_idle_opt_flushes = RCU_IDLE_OPT_FLUSHES; -module_param(rcu_idle_opt_flushes, int, 0644); static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; @@ -1629,178 +1606,97 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644); extern int tick_nohz_enabled; /* - * Does the specified flavor of RCU have non-lazy callbacks pending on - * the specified CPU? Both RCU flavor and CPU are specified by the - * rcu_data structure. - */ -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) -{ - return rdp->qlen != rdp->qlen_lazy; -} - -#ifdef CONFIG_TREE_PREEMPT_RCU - -/* - * Are there non-lazy RCU-preempt callbacks? (There cannot be if there - * is no RCU-preempt in the kernel.) + * Try to advance callbacks for all flavors of RCU on the current CPU. + * Afterwards, if there are any callbacks ready for immediate invocation, + * return true. */ -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +static bool rcu_try_advance_all_cbs(void) { - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - return __rcu_cpu_has_nonlazy_callbacks(rdp); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + bool cbs_ready = false; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) -{ - return 0; -} + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if (rdp->completed != rnp->completed && + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_process_gp_end(rsp, rdp); -/* - * Does any flavor of RCU have non-lazy callbacks on the specified CPU? - */ -static bool rcu_cpu_has_nonlazy_callbacks(int cpu) -{ - return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || - __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_cpu_has_nonlazy_callbacks(cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + cbs_ready = true; + } + return cbs_ready; } /* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready + * to invoke. If the CPU has callbacks, try to advance them. Tell the + * caller to set the timeout based on whether or not there are non-lazy + * callbacks. * - * The delta_jiffies argument is used to store the time when RCU is - * going to need the CPU again if it still has callbacks. The reason - * for this is that rcu_prepare_for_idle() might need to post a timer, - * but if so, it will do so after tick_nohz_stop_sched_tick() has set - * the wakeup time for this CPU. This means that RCU's timer can be - * delayed until the wakeup time, which defeats the purpose of posting - * a timer. + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; + /* Snapshot to detect later posting of non-lazy callback. */ + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - *delta_jiffies = ULONG_MAX; + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + *dj = ULONG_MAX; return 0; } - if (rdtp->dyntick_holdoff == jiffies) { - /* RCU recently tried and failed, so don't try again. */ - *delta_jiffies = 1; + + /* Attempt to advance callbacks. */ + if (rcu_try_advance_all_cbs()) { + /* Some ready to invoke, so initiate later invocation. */ + invoke_rcu_core(); return 1; } - /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - *delta_jiffies = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; + rdtp->last_accelerate = jiffies; + + /* Request timer delay depending on laziness, and round. */ + if (rdtp->all_lazy) { + *dj = round_up(rcu_idle_gp_delay + jiffies, + rcu_idle_gp_delay) - jiffies; } else { - *delta_jiffies = jiffies + rcu_idle_lazy_gp_delay; - *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } return 0; } /* - * Handler for smp_call_function_single(). The only point of this - * handler is to wake the CPU up, so the handler does only tracing. - */ -void rcu_idle_demigrate(void *unused) -{ - trace_rcu_prep_idle("Demigrate"); -} - -/* - * Timer handler used to force CPU to start pushing its remaining RCU - * callbacks in the case where it entered dyntick-idle mode with callbacks - * pending. The hander doesn't really need to do anything because the - * real work is done upon re-entry to idle, or by the next scheduling-clock - * interrupt should idle not be re-entered. - * - * One special case: the timer gets migrated without awakening the CPU - * on which the timer was scheduled on. In this case, we must wake up - * that CPU. We do so with smp_call_function_single(). - */ -static void rcu_idle_gp_timer_func(unsigned long cpu_in) -{ - int cpu = (int)cpu_in; - - trace_rcu_prep_idle("Timer"); - if (cpu != smp_processor_id()) - smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); - else - WARN_ON_ONCE(1); /* Getting here can hang the system... */ -} - -/* - * Initialize the timer used to pull CPUs out of dyntick-idle mode. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dyntick_holdoff = jiffies - 1; - setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); - rdtp->idle_gp_timer_expires = jiffies - 1; - rdtp->idle_first_pass = 1; -} - -/* - * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to ->idle_gp_timer, so cancel it. This will - * do nothing if this timer is not active, so just cancel it unconditionally. - */ -static void rcu_cleanup_after_idle(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - del_timer(&rdtp->idle_gp_timer); - trace_rcu_prep_idle("Cleanup after idle"); - rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); -} - -/* - * Check to see if any RCU-related work can be done by the current CPU, - * and if so, schedule a softirq to get it done. This function is part - * of the RCU implementation; it is -not- an exported member of the RCU API. - * - * The idea is for the current CPU to clear out all work required by the - * RCU core for the current grace period, so that this CPU can be permitted - * to enter dyntick-idle mode. In some cases, it will need to be awakened - * at the end of the grace period by whatever CPU ends the grace period. - * This allows CPUs to go dyntick-idle more quickly, and to reduce the - * number of wakeups by a modest integer factor. - * - * Because it is not legal to invoke rcu_process_callbacks() with irqs - * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The ->dyntick_drain field controls the sequencing. + * Prepare a CPU for idle from an RCU perspective. The first major task + * is to sense whether nohz mode has been enabled or disabled via sysfs. + * The second major task is to check to see if a non-lazy callback has + * arrived at a CPU that previously had only lazy callbacks. The third + * major task is to accelerate (that is, assign grace-period numbers to) + * any recently arrived callbacks. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { - struct timer_list *tp; + struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_node *rnp; + struct rcu_state *rsp; int tne; /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_enabled); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu)) + if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1808,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; - /* Adaptive-tick mode, where usermode execution is idle to RCU. */ - if (!is_idle_task(current)) { - rdtp->dyntick_holdoff = jiffies - 1; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("User dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + rcu_idle_gp_delay, - rcu_idle_gp_delay); - } else if (rcu_cpu_has_callbacks(cpu)) { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + rcu_idle_lazy_gp_delay); - trace_rcu_prep_idle("User dyntick with lazy callbacks"); - } else { - return; - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + /* If this is a no-CBs CPU, no callbacks, just return. */ + if (is_nocb_cpu(cpu)) return; - } /* - * If this is an idle re-entry, for example, due to use of - * RCU_NONIDLE() or the new idle-loop tracing API within the idle - * loop, then don't take any state-machine actions, unless the - * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the ->idle_gp_timer if this CPU has callbacks - * pending. + * If a non-lazy callback arrived at a CPU having only lazy + * callbacks, invoke RCU core for the side-effect of recalculating + * idle duration on re-entry to idle. */ - if (!rdtp->idle_first_pass && - (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { - if (rcu_cpu_has_callbacks(cpu)) { - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - } + if (rdtp->all_lazy && + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + invoke_rcu_core(); return; } - rdtp->idle_first_pass = 0; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* - * If there are no callbacks on this CPU, enter dyntick-idle mode. - * Also reset state to avoid prejudicing later attempts. + * If we have not yet accelerated this jiffy, accelerate all + * callbacks on this CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - rdtp->dyntick_holdoff = jiffies - 1; - rdtp->dyntick_drain = 0; - trace_rcu_prep_idle("No callbacks"); + if (rdtp->last_accelerate == jiffies) return; + rdtp->last_accelerate = jiffies; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (!*rdp->nxttail[RCU_DONE_TAIL]) + continue; + rnp = rdp->mynode; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +} - /* - * If in holdoff mode, just return. We will presumably have - * refrained from disabling the scheduling-clock tick. - */ - if (rdtp->dyntick_holdoff == jiffies) { - trace_rcu_prep_idle("In holdoff"); - return; - } +/* + * Clean up for exit from idle. Attempt to advance callbacks based on + * any grace periods that elapsed while the CPU was idle, and if any + * callbacks are now ready to invoke, initiate invocation. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + struct rcu_data *rdp; + struct rcu_state *rsp; - /* Check and update the ->dyntick_drain sequencing. */ - if (rdtp->dyntick_drain <= 0) { - /* First time through, initialize the counter. */ - rdtp->dyntick_drain = rcu_idle_flushes; - } else if (rdtp->dyntick_drain <= rcu_idle_opt_flushes && - !rcu_pending(cpu) && - !local_softirq_pending()) { - /* Can we go dyntick-idle despite still having callbacks? */ - rdtp->dyntick_drain = 0; - rdtp->dyntick_holdoff = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + rcu_idle_gp_delay, - rcu_idle_gp_delay); - } else { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + rcu_idle_lazy_gp_delay); - trace_rcu_prep_idle("Dyntick with lazy callbacks"); - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - return; /* Nothing more to do immediately. */ - } else if (--(rdtp->dyntick_drain) <= 0) { - /* We have hit the limit, so time to give up. */ - rdtp->dyntick_holdoff = jiffies; - trace_rcu_prep_idle("Begin holdoff"); - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + if (is_nocb_cpu(cpu)) return; - } - - /* - * Do one step of pushing the remaining RCU callbacks through - * the RCU core state machine. - */ -#ifdef CONFIG_TREE_PREEMPT_RCU - if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state); - } -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - if (per_cpu(rcu_sched_data, cpu).nxtlist) { - rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state); - } - if (per_cpu(rcu_bh_data, cpu).nxtlist) { - rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state); - } - - /* - * If RCU callbacks are still pending, RCU still needs this CPU. - * So try forcing the callbacks through the grace period. - */ - if (rcu_cpu_has_callbacks(cpu)) { - trace_rcu_prep_idle("More callbacks"); - invoke_rcu_core(); - } else { - trace_rcu_prep_idle("Callbacks drained"); + rcu_try_advance_all_cbs(); + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); } } @@ -2034,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier); static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - struct timer_list *tltp = &rdtp->idle_gp_timer; - char c; + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; - c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; - if (timer_pending(tltp)) - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, c, tltp->expires - jiffies); - else - sprintf(cp, "drain=%d %c timer not pending", - rdtp->dyntick_drain, c); + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + ulong2long(nlpd), + rdtp->all_lazy ? 'L' : '.', + rdtp->tick_nohz_enabled_snap ? '.' : 'D'); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dcaa670..49099e81c87b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,8 +46,6 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -#define ulong2long(a) (*(long *)(&(a))) - static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { -- cgit v1.2.3 From b92db6cb7efcbd41e469e1d757c47da4865f7622 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Sat, 29 Dec 2012 14:52:41 -0800 Subject: rcu: Rearrange locking in rcu_start_gp() If CPUs are to give prior notice of needed grace periods, it will be necessary to invoke rcu_start_gp() without dropping the root rcu_node structure's ->lock. This commit takes a first step in this direction by moving the release of this lock to the end of rcu_start_gp(). Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7b1d7769872a..2c6a9314c7f7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1521,16 +1521,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->gp_flags = RCU_GP_FLAG_INIT; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ /* Ensure that CPU is aware of completion of last grace period. */ - rcu_process_gp_end(rsp, rdp); - local_irq_restore(flags); + __rcu_process_gp_end(rsp, rdp->mynode, rdp); /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* -- cgit v1.2.3 From bd9f0686fc8c9a01c6850b1c611d1c9ad80b86d6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Sat, 29 Dec 2012 21:51:20 -0800 Subject: rcu: Repurpose no-CBs event tracing to future-GP events Dyntick-idle CPUs need to be able to pre-announce their need for grace periods. This can be done using something similar to the mechanism used by no-CB CPUs to announce their need for grace periods. This commit moves in this direction by renaming the no-CBs grace-period event tracing to suit the new future-grace-period needs. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- include/trace/events/rcu.h | 16 ++++++------ kernel/rcutree_plugin.h | 62 ++++++++++++++++++++++++---------------------- 2 files changed, 40 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index cdfed6d386eb..59ebcc89f148 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -72,10 +72,10 @@ TRACE_EVENT(rcu_grace_period, ); /* - * Tracepoint for no-callbacks grace-period events. The caller should - * pull the data from the rcu_node structure, other than rcuname, which - * comes from the rcu_state structure, and event, which is one of the - * following: + * Tracepoint for future grace-period events, including those for no-callbacks + * CPUs. The caller should pull the data from the rcu_node structure, + * other than rcuname, which comes from the rcu_state structure, and event, + * which is one of the following: * * "Startleaf": Request a nocb grace period based on leaf-node data. * "Startedleaf": Leaf-node start proved sufficient. @@ -87,7 +87,7 @@ TRACE_EVENT(rcu_grace_period, * "Cleanup": Clean up rcu_node structure after previous GP. * "CleanupMore": Clean up, and another no-CB GP is needed. */ -TRACE_EVENT(rcu_nocb_grace_period, +TRACE_EVENT(rcu_future_grace_period, TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed, unsigned long c, u8 level, int grplo, int grphi, @@ -653,9 +653,9 @@ TRACE_EVENT(rcu_barrier, #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) #define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ qsmask) do { } while (0) -#define trace_rcu_nocb_grace_period(rcuname, gpnum, completed, c, \ - level, grplo, grphi, event) \ - do { } while (0) +#define trace_rcu_future_grace_period(rcuname, gpnum, completed, c, \ + level, grplo, grphi, event) \ + do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) #define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index d318f9f18be5..df50502eca2c 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2034,9 +2034,9 @@ static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) wake_up_all(&rnp->nocb_gp_wq[c & 0x1]); rnp->n_nocb_gp_requests[c & 0x1] = 0; needmore = rnp->n_nocb_gp_requests[(c + 1) & 0x1]; - trace_rcu_nocb_grace_period(rsp->name, rnp->gpnum, rnp->completed, - c, rnp->level, rnp->grplo, rnp->grphi, - needmore ? "CleanupMore" : "Cleanup"); + trace_rcu_future_grace_period(rsp->name, rnp->gpnum, rnp->completed, + c, rnp->level, rnp->grplo, rnp->grphi, + needmore ? "CleanupMore" : "Cleanup"); return needmore; } @@ -2183,9 +2183,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) /* Count our request for a grace period. */ rnp->n_nocb_gp_requests[c & 0x1]++; - trace_rcu_nocb_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, - c, rnp->level, rnp->grplo, rnp->grphi, - "Startleaf"); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, "Startleaf"); if (rnp->gpnum != rnp->completed) { @@ -2194,10 +2194,10 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * is in progress, so we are done. When this grace * period ends, our request will be acted upon. */ - trace_rcu_nocb_grace_period(rdp->rsp->name, - rnp->gpnum, rnp->completed, c, - rnp->level, rnp->grplo, rnp->grphi, - "Startedleaf"); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, + "Startedleaf"); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { @@ -2209,11 +2209,12 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (rnp != rnp_root) raw_spin_lock(&rnp_root->lock); /* irqs disabled. */ if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_nocb_grace_period(rdp->rsp->name, - rnp->gpnum, rnp->completed, - c, rnp->level, - rnp->grplo, rnp->grphi, - "Startedleafroot"); + trace_rcu_future_grace_period(rdp->rsp->name, + rnp->gpnum, + rnp->completed, + c, rnp->level, + rnp->grplo, rnp->grphi, + "Startedleafroot"); raw_spin_unlock(&rnp_root->lock); /* irqs disabled. */ } else { @@ -2229,11 +2230,12 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) c = rnp_root->completed + 1; rnp->n_nocb_gp_requests[c & 0x1]++; rnp_root->n_nocb_gp_requests[c & 0x1]++; - trace_rcu_nocb_grace_period(rdp->rsp->name, - rnp->gpnum, rnp->completed, - c, rnp->level, - rnp->grplo, rnp->grphi, - "Startedroot"); + trace_rcu_future_grace_period(rdp->rsp->name, + rnp->gpnum, + rnp->completed, + c, rnp->level, + rnp->grplo, rnp->grphi, + "Startedroot"); local_save_flags(flags1); rcu_start_gp(rdp->rsp, flags1); /* Rlses ->lock. */ } @@ -2249,9 +2251,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_nocb_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, - c, rnp->level, rnp->grplo, rnp->grphi, - "StartWait"); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, "StartWait"); for (;;) { wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2259,14 +2261,14 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; flush_signals(current); - trace_rcu_nocb_grace_period(rdp->rsp->name, - rnp->gpnum, rnp->completed, c, - rnp->level, rnp->grplo, rnp->grphi, - "ResumeWait"); + trace_rcu_future_grace_period(rdp->rsp->name, + rnp->gpnum, rnp->completed, c, + rnp->level, rnp->grplo, + rnp->grphi, "ResumeWait"); } - trace_rcu_nocb_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, - c, rnp->level, rnp->grplo, rnp->grphi, - "EndWait"); + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, "EndWait"); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } -- cgit v1.2.3 From b8462084a2a88a6a0489f9bb7d8b1bb95bc455ab Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Sat, 29 Dec 2012 22:04:18 -0800 Subject: rcu: Push lock release to rcu_start_gp()'s callers If CPUs are to give prior notice of needed grace periods, it will be necessary to invoke rcu_start_gp() without dropping the root rcu_node structure's ->lock. This commit takes a second step in this direction by moving the release of this lock to rcu_start_gp()'s callers. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 24 ++++++++++-------------- kernel/rcutree_plugin.h | 5 ++--- 2 files changed, 12 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2c6a9314c7f7..0d532950baa3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1487,16 +1487,14 @@ static int __noreturn rcu_gp_kthread(void *arg) /* * Start a new RCU grace period if warranted, re-initializing the hierarchy * in preparation for detecting the next grace period. The caller must hold - * the root node's ->lock, which is released before return. Hard irqs must - * be disabled. + * the root node's ->lock and hard irqs must be disabled. * * Note that it is legal for a dying CPU (which is marked as offline) to * invoke this function. This can happen when the dying CPU reports its * quiescent state. */ static void -rcu_start_gp(struct rcu_state *rsp, unsigned long flags) - __releases(rcu_get_root(rsp)->lock) +rcu_start_gp(struct rcu_state *rsp) { struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); @@ -1510,15 +1508,13 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) */ rcu_advance_cbs(rsp, rnp, rdp); - if (!rsp->gp_kthread || - !cpu_needs_another_gp(rsp, rdp)) { + if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period * task, this CPU does not need another grace period, * or a grace period is already in progress. * Either way, don't start a new grace period. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } rsp->gp_flags = RCU_GP_FLAG_INIT; @@ -1528,15 +1524,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); - raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* * Report a full set of quiescent states to the specified rcu_state * data structure. This involves cleaning up after the prior grace * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, as - * required by rcu_start_gp(), which will release it. + * if one is needed. Note that the caller must hold rnp->lock, which + * is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) @@ -2134,7 +2129,8 @@ __rcu_process_callbacks(struct rcu_state *rsp) local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ - rcu_start_gp(rsp, flags); /* releases above lock */ + rcu_start_gp(rsp); + raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); } else { local_irq_restore(flags); } @@ -2214,11 +2210,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Start a new grace period if one not already started. */ if (!rcu_gp_in_progress(rsp)) { - unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp_root->lock, nestflag); - rcu_start_gp(rsp, nestflag); /* rlses rnp_root->lock */ + raw_spin_lock(&rnp_root->lock); + rcu_start_gp(rsp); + raw_spin_unlock(&rnp_root->lock); } else { /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index df50502eca2c..073ded26e259 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2174,7 +2174,6 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) unsigned long c; bool d; unsigned long flags; - unsigned long flags1; struct rcu_node *rnp = rdp->mynode; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); @@ -2236,8 +2235,8 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) c, rnp->level, rnp->grplo, rnp->grphi, "Startedroot"); - local_save_flags(flags1); - rcu_start_gp(rdp->rsp, flags1); /* Rlses ->lock. */ + rcu_start_gp(rdp->rsp); + raw_spin_unlock(&rnp->lock); } /* Clean up locking and irq state. */ -- cgit v1.2.3 From 8b425aa8f1acfe48aed919c7aadff2ed290fe969 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Sun, 30 Dec 2012 13:06:35 -0800 Subject: rcu: Rename n_nocb_gp_requests to need_future_gp CPUs going idle need to be able to indicate their need for future grace periods. A mechanism for doing this already exists for no-callbacks CPUs, so the idea is to re-use that mechanism. This commit therefore moves the ->n_nocb_gp_requests field of the rcu_node structure out from under the CONFIG_RCU_NOCB_CPU #ifdef and renames it to ->need_future_gp. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.h | 4 ++-- kernel/rcutree_plugin.h | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 96a27f922e92..034b524594bd 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -194,9 +194,9 @@ struct rcu_node { #ifdef CONFIG_RCU_NOCB_CPU wait_queue_head_t nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ - int n_nocb_gp_requests[2]; - /* Counts of upcoming no-CB GP requests. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ + int need_future_gp[2]; + /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 073ded26e259..f3f0020b5b58 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2018,7 +2018,7 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp) { struct rcu_node *rnp = rcu_get_root(rsp); - return rnp->n_nocb_gp_requests[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; + return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; } /* @@ -2032,8 +2032,8 @@ static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; wake_up_all(&rnp->nocb_gp_wq[c & 0x1]); - rnp->n_nocb_gp_requests[c & 0x1] = 0; - needmore = rnp->n_nocb_gp_requests[(c + 1) & 0x1]; + rnp->need_future_gp[c & 0x1] = 0; + needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_grace_period(rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, rnp->grplo, rnp->grphi, needmore ? "CleanupMore" : "Cleanup"); @@ -2041,7 +2041,7 @@ static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) } /* - * Set the root rcu_node structure's ->n_nocb_gp_requests field + * Set the root rcu_node structure's ->need_future_gp field * based on the sum of those of all rcu_node structures. This does * double-count the root rcu_node structure's requests, but this * is necessary to handle the possibility of a rcu_nocb_kthread() @@ -2050,7 +2050,7 @@ static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) */ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { - rnp->n_nocb_gp_requests[(rnp->completed + 1) & 0x1] += nrq; + rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } static void rcu_init_one_nocb(struct rcu_node *rnp) @@ -2181,7 +2181,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) c = rnp->completed + 2; /* Count our request for a grace period. */ - rnp->n_nocb_gp_requests[c & 0x1]++; + rnp->need_future_gp[c & 0x1]++; trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, rnp->grplo, rnp->grphi, "Startleaf"); @@ -2225,10 +2225,10 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Adjust counters accordingly and start the * needed grace period. */ - rnp->n_nocb_gp_requests[c & 0x1]--; + rnp->need_future_gp[c & 0x1]--; c = rnp_root->completed + 1; - rnp->n_nocb_gp_requests[c & 0x1]++; - rnp_root->n_nocb_gp_requests[c & 0x1]++; + rnp->need_future_gp[c & 0x1]++; + rnp_root->need_future_gp[c & 0x1]++; trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, -- cgit v1.2.3 From 0446be489795d8bb994125a916ef03211f539e54 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Sun, 30 Dec 2012 15:21:01 -0800 Subject: rcu: Abstract rcu_start_future_gp() from rcu_nocb_wait_gp() CPUs going idle will need to record the need for a future grace period, but won't actually need to block waiting on it. This commit therefore splits rcu_start_future_gp(), which does the recording, from rcu_nocb_wait_gp(), which now invokes rcu_start_future_gp() to do the recording, after which rcu_nocb_wait_gp() does the waiting. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcutree.h | 2 +- kernel/rcutree_plugin.h | 104 ++++------------------------------------ 3 files changed, 130 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0d532950baa3..f4b23f16677a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -224,6 +224,7 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); +static void rcu_start_gp(struct rcu_state *rsp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -1074,6 +1075,120 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, return rnp->completed + 2; } +/* + * Trace-event helper function for rcu_start_future_gp() and + * rcu_nocb_wait_gp(). + */ +static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, + unsigned long c, char *s) +{ + trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, + rnp->completed, c, rnp->level, + rnp->grplo, rnp->grphi, s); +} + +/* + * Start some future grace period, as needed to handle newly arrived + * callbacks. The required future grace periods are recorded in each + * rcu_node structure's ->need_future_gp field. + * + * The caller must hold the specified rcu_node structure's ->lock. + */ +static unsigned long __maybe_unused +rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) +{ + unsigned long c; + int i; + struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); + + /* + * Pick up grace-period number for new callbacks. If this + * grace period is already marked as needed, return to the caller. + */ + c = rcu_cbs_completed(rdp->rsp, rnp); + trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + if (rnp->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + return c; + } + + /* + * If either this rcu_node structure or the root rcu_node structure + * believe that a grace period is in progress, then we must wait + * for the one following, which is in "c". Because our request + * will be noticed at the end of the current grace period, we don't + * need to explicitly start one. + */ + if (rnp->gpnum != rnp->completed || + ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { + rnp->need_future_gp[c & 0x1]++; + trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + return c; + } + + /* + * There might be no grace period in progress. If we don't already + * hold it, acquire the root rcu_node structure's lock in order to + * start one (if needed). + */ + if (rnp != rnp_root) + raw_spin_lock(&rnp_root->lock); + + /* + * Get a new grace-period number. If there really is no grace + * period in progress, it will be smaller than the one we obtained + * earlier. Adjust callbacks as needed. Note that even no-CBs + * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + */ + c = rcu_cbs_completed(rdp->rsp, rnp_root); + for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) + if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) + rdp->nxtcompleted[i] = c; + + /* + * If the needed for the required grace period is already + * recorded, trace and leave. + */ + if (rnp_root->need_future_gp[c & 0x1]) { + trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + goto unlock_out; + } + + /* Record the need for the future grace period. */ + rnp_root->need_future_gp[c & 0x1]++; + + /* If a grace period is not already in progress, start one. */ + if (rnp_root->gpnum != rnp_root->completed) { + trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + } else { + trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + rcu_start_gp(rdp->rsp); + } +unlock_out: + if (rnp != rnp_root) + raw_spin_unlock(&rnp_root->lock); + return c; +} + +/* + * Clean up any old requests for the just-ended grace period. Also return + * whether any additional grace periods have been requested. Also invoke + * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads + * waiting for this grace period to complete. + */ +static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +{ + int c = rnp->completed; + int needmore; + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + + rcu_nocb_gp_cleanup(rsp, rnp); + rnp->need_future_gp[c & 0x1] = 0; + needmore = rnp->need_future_gp[(c + 1) & 0x1]; + trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + return needmore; +} + /* * If there is room, assign a ->completed number to any callbacks on * this CPU that have not already been assigned. Also accelerate any @@ -1312,9 +1427,9 @@ static int rcu_gp_init(struct rcu_state *rsp) rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; + ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; WARN_ON_ONCE(rnp->completed != rsp->completed); - rnp->completed = rsp->completed; + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); @@ -1395,11 +1510,11 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); - rnp->completed = rsp->gpnum; + ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) __rcu_process_gp_end(rsp, rnp, rdp); - nocb += rcu_nocb_gp_cleanup(rsp, rnp); + nocb += rcu_future_gp_cleanup(rsp, rnp); raw_spin_unlock_irq(&rnp->lock); cond_resched(); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 034b524594bd..c11b46c33159 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -526,7 +526,7 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static int rcu_nocb_needs_gp(struct rcu_state *rsp); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool is_nocb_cpu(int cpu); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f3f0020b5b58..723af5f707f0 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2022,22 +2022,12 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp) } /* - * Clean up this rcu_node structure's no-CBs state at the end of - * a grace period, and also return whether any no-CBs CPU associated - * with this rcu_node structure needs another grace period. + * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended + * grace period. */ -static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - int c = rnp->completed; - int needmore; - - wake_up_all(&rnp->nocb_gp_wq[c & 0x1]); - rnp->need_future_gp[c & 0x1] = 0; - needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_grace_period(rsp->name, rnp->gpnum, rnp->completed, - c, rnp->level, rnp->grplo, rnp->grphi, - needmore ? "CleanupMore" : "Cleanup"); - return needmore; + wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); } /* @@ -2175,84 +2165,16 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool d; unsigned long flags; struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); raw_spin_lock_irqsave(&rnp->lock, flags); - c = rnp->completed + 2; - - /* Count our request for a grace period. */ - rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, "Startleaf"); - - if (rnp->gpnum != rnp->completed) { - - /* - * This rcu_node structure believes that a grace period - * is in progress, so we are done. When this grace - * period ends, our request will be acted upon. - */ - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, - "Startedleaf"); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - - } else { - - /* - * Might not be a grace period, check root rcu_node - * structure to see if we must start one. - */ - if (rnp != rnp_root) - raw_spin_lock(&rnp_root->lock); /* irqs disabled. */ - if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_grace_period(rdp->rsp->name, - rnp->gpnum, - rnp->completed, - c, rnp->level, - rnp->grplo, rnp->grphi, - "Startedleafroot"); - raw_spin_unlock(&rnp_root->lock); /* irqs disabled. */ - } else { - - /* - * No grace period, so we need to start one. - * The good news is that we can wait for exactly - * one grace period instead of part of the current - * grace period and all of the next grace period. - * Adjust counters accordingly and start the - * needed grace period. - */ - rnp->need_future_gp[c & 0x1]--; - c = rnp_root->completed + 1; - rnp->need_future_gp[c & 0x1]++; - rnp_root->need_future_gp[c & 0x1]++; - trace_rcu_future_grace_period(rdp->rsp->name, - rnp->gpnum, - rnp->completed, - c, rnp->level, - rnp->grplo, rnp->grphi, - "Startedroot"); - rcu_start_gp(rdp->rsp); - raw_spin_unlock(&rnp->lock); - } - - /* Clean up locking and irq state. */ - if (rnp != rnp_root) - raw_spin_unlock_irqrestore(&rnp->lock, flags); - else - local_irq_restore(flags); - } + c = rcu_start_future_gp(rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, "StartWait"); + trace_rcu_future_gp(rnp, rdp, c, "StartWait"); for (;;) { wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2260,14 +2182,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; flush_signals(current); - trace_rcu_future_grace_period(rdp->rsp->name, - rnp->gpnum, rnp->completed, c, - rnp->level, rnp->grplo, - rnp->grphi, "ResumeWait"); + trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); } - trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, - rnp->completed, c, rnp->level, - rnp->grplo, rnp->grphi, "EndWait"); + trace_rcu_future_gp(rnp, rdp, c, "EndWait"); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } @@ -2375,9 +2292,8 @@ static int rcu_nocb_needs_gp(struct rcu_state *rsp) return 0; } -static int rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { - return 0; } static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) -- cgit v1.2.3 From 910ee45db2f4837c8440e770474758493ab94bf7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paul.mckenney@linaro.org> Date: Mon, 31 Dec 2012 02:24:21 -0800 Subject: rcu: Make rcu_accelerate_cbs() note need for future grace periods Now that rcu_start_future_gp() has been abstracted from rcu_nocb_wait_gp(), rcu_accelerate_cbs() can invoke rcu_start_future_gp() so as to register the need for any future grace periods needed by a CPU about to enter dyntick-idle mode. This commit makes this change. Note that some refactoring of rcu_start_gp() is carried out to avoid recursion and subsequent self-deadlocks. Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> --- kernel/rcutree.c | 50 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f4b23f16677a..9cb91e4885af 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -224,7 +224,8 @@ static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; module_param(jiffies_till_first_fqs, ulong, 0644); module_param(jiffies_till_next_fqs, ulong, 0644); -static void rcu_start_gp(struct rcu_state *rsp); +static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp); static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -1162,7 +1163,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); } else { trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); - rcu_start_gp(rdp->rsp); + rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: if (rnp != rnp_root) @@ -1248,6 +1249,8 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtcompleted[i] = c; } + /* Record any needed additional grace periods. */ + rcu_start_future_gp(rnp, rdp); /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) @@ -1609,20 +1612,9 @@ static int __noreturn rcu_gp_kthread(void *arg) * quiescent state. */ static void -rcu_start_gp(struct rcu_state *rsp) +rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp) { - struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - struct rcu_node *rnp = rcu_get_root(rsp); - - /* - * If there is no grace period in progress right now, any - * callbacks we have up to this point will be satisfied by the - * next grace period. Also, advancing the callbacks reduces the - * probability of false positives from cpu_needs_another_gp() - * resulting in pointless grace periods. So, advance callbacks! - */ - rcu_advance_cbs(rsp, rnp, rdp); - if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { /* * Either we have not yet spawned the grace-period @@ -1634,13 +1626,35 @@ rcu_start_gp(struct rcu_state *rsp) } rsp->gp_flags = RCU_GP_FLAG_INIT; - /* Ensure that CPU is aware of completion of last grace period. */ - __rcu_process_gp_end(rsp, rdp->mynode, rdp); - /* Wake up rcu_gp_kthread() to start the grace period. */ wake_up(&rsp->gp_wq); } +/* + * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's + * callbacks. Note that rcu_start_gp_advanced() cannot do this because it + * is invoked indirectly from rcu_advance_cbs(), which would result in + * endless recursion -- or would do so if it wasn't for the self-deadlock + * that is encountered beforehand. + */ +static void +rcu_start_gp(struct rcu_state *rsp) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* + * If there is no grace period in progress right now, any + * callbacks we have up to this point will be satisfied by the + * next grace period. Also, advancing the callbacks reduces the + * probability of false positives from cpu_needs_another_gp() + * resulting in pointless grace periods. So, advance callbacks + * then start the grace period! + */ + rcu_advance_cbs(rsp, rnp, rdp); + rcu_start_gp_advanced(rsp, rnp, rdp); +} + /* * Report a full set of quiescent states to the specified rcu_state * data structure. This involves cleaning up after the prior grace -- cgit v1.2.3 From 84cc8fd2fe65866e49d70b38b3fdf7219dd92fe0 Mon Sep 17 00:00:00 2001 From: Michael Bohan <mbohan@codeaurora.org> Date: Tue, 19 Mar 2013 19:19:25 -0700 Subject: hrtimer: Don't reinitialize a cpu_base lock on CPU_UP The current code makes the assumption that a cpu_base lock won't be held if the CPU corresponding to that cpu_base is offline, which isn't always true. If a hrtimer is not queued, then it will not be migrated by migrate_hrtimers() when a CPU is offlined. Therefore, the hrtimer's cpu_base may still point to a CPU which has subsequently gone offline if the timer wasn't enqueued at the time the CPU went down. Normally this wouldn't be a problem, but a cpu_base's lock is blindly reinitialized each time a CPU is brought up. If a CPU is brought online during the period that another thread is performing a hrtimer operation on a stale hrtimer, then the lock will be reinitialized under its feet, and a SPIN_BUG() like the following will be observed: <0>[ 28.082085] BUG: spinlock already unlocked on CPU#0, swapper/0/0 <0>[ 28.087078] lock: 0xc4780b40, value 0x0 .magic: dead4ead, .owner: <none>/-1, .owner_cpu: -1 <4>[ 42.451150] [<c0014398>] (unwind_backtrace+0x0/0x120) from [<c0269220>] (do_raw_spin_unlock+0x44/0xdc) <4>[ 42.460430] [<c0269220>] (do_raw_spin_unlock+0x44/0xdc) from [<c071b5bc>] (_raw_spin_unlock+0x8/0x30) <4>[ 42.469632] [<c071b5bc>] (_raw_spin_unlock+0x8/0x30) from [<c00a9ce0>] (__hrtimer_start_range_ns+0x1e4/0x4f8) <4>[ 42.479521] [<c00a9ce0>] (__hrtimer_start_range_ns+0x1e4/0x4f8) from [<c00aa014>] (hrtimer_start+0x20/0x28) <4>[ 42.489247] [<c00aa014>] (hrtimer_start+0x20/0x28) from [<c00e6190>] (rcu_idle_enter_common+0x1ac/0x320) <4>[ 42.498709] [<c00e6190>] (rcu_idle_enter_common+0x1ac/0x320) from [<c00e6440>] (rcu_idle_enter+0xa0/0xb8) <4>[ 42.508259] [<c00e6440>] (rcu_idle_enter+0xa0/0xb8) from [<c000f268>] (cpu_idle+0x24/0xf0) <4>[ 42.516503] [<c000f268>] (cpu_idle+0x24/0xf0) from [<c06ed3c0>] (rest_init+0x88/0xa0) <4>[ 42.524319] [<c06ed3c0>] (rest_init+0x88/0xa0) from [<c0c00978>] (start_kernel+0x3d0/0x434) As an example, this particular crash occurred when hrtimer_start() was executed on CPU #0. The code locked the hrtimer's current cpu_base corresponding to CPU #1. CPU #0 then tried to switch the hrtimer's cpu_base to an optimal CPU which was online. In this case, it selected the cpu_base corresponding to CPU #3. Before it could proceed, CPU #1 came online and reinitialized the spinlock corresponding to its cpu_base. Thus now CPU #0 held a lock which was reinitialized. When CPU #0 finally ended up unlocking the old cpu_base corresponding to CPU #1 so that it could switch to CPU #3, we hit this SPIN_BUG() above while in switch_hrtimer_base(). CPU #0 CPU #1 ---- ---- ... <offline> hrtimer_start() lock_hrtimer_base(base #1) ... init_hrtimers_cpu() switch_hrtimer_base() ... ... raw_spin_lock_init(&cpu_base->lock) raw_spin_unlock(&cpu_base->lock) ... <spin_bug> Solve this by statically initializing the lock. Signed-off-by: Michael Bohan <mbohan@codeaurora.org> Link: http://lkml.kernel.org/r/1363745965-23475-1-git-send-email-mbohan@codeaurora.org Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- kernel/hrtimer.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..14be27feda49 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -63,6 +63,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { @@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); -- cgit v1.2.3 From 3151527ee007b73a0ebd296010f1c0454a919c7d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Fri, 15 Mar 2013 01:45:51 -0700 Subject: userns: Don't allow creation if the user is chrooted Guarantee that the policy of which files may be access that is established by setting the root directory will not be violated by user namespaces by verifying that the root directory points to the root of the mount namespace at the time of user namespace creation. Changing the root is a privileged operation, and as a matter of policy it serves to limit unprivileged processes to files below the current root directory. For reasons of simplicity and comprehensibility the privilege to change the root directory is gated solely on the CAP_SYS_CHROOT capability in the user namespace. Therefore when creating a user namespace we must ensure that the policy of which files may be access can not be violated by changing the root directory. Anyone who runs a processes in a chroot and would like to use user namespace can setup the same view of filesystems with a mount namespace instead. With this result that this is not a practical limitation for using user namespaces. Cc: stable@vger.kernel.org Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Reported-by: Andy Lutomirski <luto@amacapital.net> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> --- fs/namespace.c | 24 ++++++++++++++++++++++++ include/linux/fs_struct.h | 2 ++ kernel/user_namespace.c | 9 +++++++++ 3 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 50ca17d3cb45..a3035223d421 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2732,6 +2732,30 @@ bool our_mnt(struct vfsmount *mnt) return check_mnt(real_mount(mnt)); } +bool current_chrooted(void) +{ + /* Does the current process have a non-standard root */ + struct path ns_root; + struct path fs_root; + bool chrooted; + + /* Find the namespace root */ + ns_root.mnt = ¤t->nsproxy->mnt_ns->root->mnt; + ns_root.dentry = ns_root.mnt->mnt_root; + path_get(&ns_root); + while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) + ; + + get_fs_root(current->fs, &fs_root); + + chrooted = !path_equal(&fs_root, &ns_root); + + path_put(&fs_root); + path_put(&ns_root); + + return chrooted; +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index 729eded4b24f..2b93a9a5a1e6 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -50,4 +50,6 @@ static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, spin_unlock(&fs->lock); } +extern bool current_chrooted(void); + #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b14f4d342043..0f1e42884577 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -61,6 +61,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. -- cgit v1.2.3 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae03..d581e45c0a9f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..9c7fab1d23f0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> +#include <linux/user_namespace.h> #include <linux/mount.h> #include <linux/pid_namespace.h> #include <linux/parser.h> @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec733..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include <linux/module.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/user_namespace.h> #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce009324933..b6b215f13b45 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03b..8e635a18ab52 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e42884577..a54f26f82eb2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3 From dbf520a9d7d4d5ba28d2947be11e34099a5e3e20 Mon Sep 17 00:00:00 2001 From: Paul Walmsley <paul@pwsan.com> Date: Sun, 31 Mar 2013 00:04:40 +0000 Subject: Revert "lockdep: check that no locks held at freeze time" This reverts commit 6aa9707099c4b25700940eb3d016f16c4434360d. Commit 6aa9707099c4 ("lockdep: check that no locks held at freeze time") causes problems with NFS root filesystems. The failures were noticed on OMAP2 and 3 boards during kernel init: [ BUG: swapper/0/1 still has locks held! ] 3.9.0-rc3-00344-ga937536 #1 Not tainted ------------------------------------- 1 lock held by swapper/0/1: #0: (&type->s_umount_key#13/1){+.+.+.}, at: [<c011e84c>] sget+0x248/0x574 stack backtrace: rpc_wait_bit_killable __wait_on_bit out_of_line_wait_on_bit __rpc_execute rpc_run_task rpc_call_sync nfs_proc_get_root nfs_get_root nfs_fs_mount_common nfs_try_mount nfs_fs_mount mount_fs vfs_kern_mount do_mount sys_mount do_mount_root mount_root prepare_namespace kernel_init_freeable kernel_init Although the rootfs mounts, the system is unstable. Here's a transcript from a PM test: http://www.pwsan.com/omap/testlogs/test_v3.9-rc3/20130317194234/pm/37xxevm/37xxevm_log.txt Here's what the test log should look like: http://www.pwsan.com/omap/testlogs/test_v3.8/20130218214403/pm/37xxevm/37xxevm_log.txt Mailing list discussion is here: http://lkml.org/lkml/2013/3/4/221 Deal with this for v3.9 by reverting the problem commit, until folks can figure out the right long-term course of action. Signed-off-by: Paul Walmsley <paul@pwsan.com> Cc: Mandeep Singh Baines <msb@chromium.org> Cc: Jeff Layton <jlayton@redhat.com> Cc: Shawn Guo <shawn.guo@linaro.org> Cc: <maciej.rutecki@gmail.com> Cc: Fengguang Wu <fengguang.wu@intel.com> Cc: Trond Myklebust <Trond.Myklebust@netapp.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Ben Chan <benchan@chromium.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Tejun Heo <tj@kernel.org> Cc: Rafael J. Wysocki <rjw@sisk.pl> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/debug_locks.h | 4 ++-- include/linux/freezer.h | 3 --- kernel/exit.c | 2 +- kernel/lockdep.c | 17 +++++++++-------- 4 files changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index a975de1ff59f..3bd46f766751 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -51,7 +51,7 @@ struct task_struct; extern void debug_show_all_locks(void); extern void debug_show_held_locks(struct task_struct *task); extern void debug_check_no_locks_freed(const void *from, unsigned long len); -extern void debug_check_no_locks_held(void); +extern void debug_check_no_locks_held(struct task_struct *task); #else static inline void debug_show_all_locks(void) { @@ -67,7 +67,7 @@ debug_check_no_locks_freed(const void *from, unsigned long len) } static inline void -debug_check_no_locks_held(void) +debug_check_no_locks_held(struct task_struct *task) { } #endif diff --git a/include/linux/freezer.h b/include/linux/freezer.h index 043a5cf8b5ba..e70df40d84f6 100644 --- a/include/linux/freezer.h +++ b/include/linux/freezer.h @@ -3,7 +3,6 @@ #ifndef FREEZER_H_INCLUDED #define FREEZER_H_INCLUDED -#include <linux/debug_locks.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/atomic.h> @@ -49,8 +48,6 @@ extern void thaw_kernel_threads(void); static inline bool try_to_freeze(void) { - if (!(current->flags & PF_NOFREEZE)) - debug_check_no_locks_held(); might_sleep(); if (likely(!freezing(current))) return false; diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca9935..60bc027c61c3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(); + debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d9..8a0efac4f99d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4088,7 +4088,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr) { if (!debug_locks_off()) return; @@ -4097,21 +4097,22 @@ static void print_held_locks_bug(void) printk("\n"); printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", - current->comm, task_pid_nr(current)); + printk("[ BUG: lock held at task exit time! ]\n"); print_kernel_ident(); printk("-------------------------------------\n"); - lockdep_print_held_locks(current); + printk("%s/%d is exiting with locks still held!\n", + curr->comm, task_pid_nr(curr)); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task) { - if (unlikely(current->lockdep_depth > 0)) - print_held_locks_bug(); + if (unlikely(task->lockdep_depth > 0)) + print_held_locks_bug(task); } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { -- cgit v1.2.3 From c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen <ak@linux.intel.com> Date: Thu, 24 Jan 2013 16:10:28 +0100 Subject: perf/core: Add weighted samples For some events it's useful to weight sample with a hardware provided number. This expresses how expensive the action the sample represent was. This allows the profiler to scale the samples to be more informative to the programmer. There is already the period which is used similarly, but it means something different, so I chose to not overload it. Instead a new sample type for WEIGHT is added. Can be used for multiple things. Initially it is used for TSX abort costs and profiling by memory latencies (so to make expensive load appear higher up in the histograms). The concept is quite generic and can be extended to many other kinds of events or architectures, as long as the hardware provides suitable auxillary values. In principle it could be also used for software tracepoints. This adds the generic glue. A new optional sample format for a 64-bit weight value. Signed-off-by: Andi Kleen <ak@linux.intel.com> Signed-off-by: Stephane Eranian <eranian@google.com> Cc: peterz@infradead.org Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 6 +++++- kernel/events/core.c | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cd3bb2cd9494..7ce0b37b155b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -573,6 +573,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; struct perf_regs_user regs_user; u64 stack_user_size; + u64 weight; }; static inline void perf_sample_data_init(struct perf_sample_data *data, @@ -586,6 +587,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE; data->regs_user.regs = NULL; data->stack_user_size = 0; + data->weight = 0; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9fa9c622a7f4..cdc255da02e2 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -132,8 +132,10 @@ enum perf_event_sample_format { PERF_SAMPLE_BRANCH_STACK = 1U << 11, PERF_SAMPLE_REGS_USER = 1U << 12, PERF_SAMPLE_STACK_USER = 1U << 13, + PERF_SAMPLE_WEIGHT = 1U << 14, + + PERF_SAMPLE_MAX = 1U << 15, /* non-ABI */ - PERF_SAMPLE_MAX = 1U << 14, /* non-ABI */ }; /* @@ -588,6 +590,8 @@ enum perf_event_type { * { u64 size; * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * + * { u64 weight; } && PERF_SAMPLE_WEIGHT * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b4a55d41efc..9e3edb272b3e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -976,6 +976,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); + if (sample_type & PERF_SAMPLE_WEIGHT) + size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -4193,6 +4196,9 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + + if (sample_type & PERF_SAMPLE_WEIGHT) + perf_output_put(handle, data->weight); } void perf_prepare_sample(struct perf_event_header *header, -- cgit v1.2.3 From d6be9ad6c960f43800a6f118932bc8a5a4eadcd1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Thu, 24 Jan 2013 16:10:31 +0100 Subject: perf: Add generic memory sampling interface This patch adds PERF_SAMPLE_DATA_SRC. PERF_SAMPLE_DATA_SRC collects the data source, i.e., where did the data associated with the sampled instruction come from. Information is stored in a perf_mem_data_src structure. It contains opcode, mem level, tlb, snoop, lock information, subject to availability in hardware. Signed-off-by: Stephane Eranian <eranian@google.com> Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-8-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 68 +++++++++++++++++++++++++++++++++++++++-- kernel/events/core.c | 6 ++++ 3 files changed, 74 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7ce0b37b155b..42a6daaf4e0a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -568,6 +568,7 @@ struct perf_sample_data { u32 reserved; } cpu_entry; u64 period; + union perf_mem_data_src data_src; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -588,6 +589,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->regs_user.regs = NULL; data->stack_user_size = 0; data->weight = 0; + data->data_src.val = 0; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index cdc255da02e2..5b5762006855 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -133,9 +133,9 @@ enum perf_event_sample_format { PERF_SAMPLE_REGS_USER = 1U << 12, PERF_SAMPLE_STACK_USER = 1U << 13, PERF_SAMPLE_WEIGHT = 1U << 14, + PERF_SAMPLE_DATA_SRC = 1U << 15, - PERF_SAMPLE_MAX = 1U << 15, /* non-ABI */ - + PERF_SAMPLE_MAX = 1U << 16, /* non-ABI */ }; /* @@ -592,6 +592,7 @@ enum perf_event_type { * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * }; */ PERF_RECORD_SAMPLE = 9, @@ -617,4 +618,67 @@ enum perf_callchain_context { #define PERF_FLAG_FD_OUTPUT (1U << 1) #define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_rsvd:31; + }; +}; + +/* type of opcode (load/store/prefetch,code) */ +#define PERF_MEM_OP_NA 0x01 /* not available */ +#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ +#define PERF_MEM_OP_STORE 0x04 /* store instruction */ +#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ +#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ +#define PERF_MEM_OP_SHIFT 0 + +/* memory hierarchy (memory level, hit or miss) */ +#define PERF_MEM_LVL_NA 0x01 /* not available */ +#define PERF_MEM_LVL_HIT 0x02 /* hit level */ +#define PERF_MEM_LVL_MISS 0x04 /* miss level */ +#define PERF_MEM_LVL_L1 0x08 /* L1 */ +#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x20 /* L2 hit */ +#define PERF_MEM_LVL_L3 0x40 /* L3 hit */ +#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +/* snoop mode */ +#define PERF_MEM_SNOOP_NA 0x01 /* not available */ +#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ +#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +/* locked instruction */ +#define PERF_MEM_LOCK_NA 0x01 /* not available */ +#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 + +/* TLB access */ +#define PERF_MEM_TLB_NA 0x01 /* not available */ +#define PERF_MEM_TLB_HIT 0x02 /* hit level */ +#define PERF_MEM_TLB_MISS 0x04 /* miss level */ +#define PERF_MEM_TLB_L1 0x08 /* L1 */ +#define PERF_MEM_TLB_L2 0x10 /* L2 */ +#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 + +#define PERF_MEM_S(a, s) \ + (((u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 9e3edb272b3e..77c96d18c23a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -982,6 +982,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_READ) size += event->read_size; + if (sample_type & PERF_SAMPLE_DATA_SRC) + size += sizeof(data->data_src.val); + event->header_size = size; } @@ -4199,6 +4202,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_WEIGHT) perf_output_put(handle, data->weight); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + perf_output_put(handle, data->data_src.val); } void perf_prepare_sample(struct perf_event_header *header, -- cgit v1.2.3 From 2fe85427e3bf65d791700d065132772fc26e4d75 Mon Sep 17 00:00:00 2001 From: Stephane Eranian <eranian@google.com> Date: Thu, 24 Jan 2013 16:10:39 +0100 Subject: perf: Add PERF_RECORD_MISC_MMAP_DATA to RECORD_MMAP Type of mapping was lost and made it hard for a tool to distinguish code vs. data mmaps. Perf has the ability to distinguish the two. Use a bit in the header->misc bitmask to keep track of the mmap type. If PERF_RECORD_MISC_MMAP_DATA is set then the mapping is not executable (!VM_EXEC). If not set, then the mapping is executable. Signed-off-by: Stephane Eranian <eranian@google.com> Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-16-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar <mingo@kernel.org> Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com> --- include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 5b5762006855..964a450a6e2c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -445,6 +445,7 @@ struct perf_event_mmap_page { #define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) #define PERF_RECORD_MISC_GUEST_USER (5 << 0) +#define PERF_RECORD_MISC_MMAP_DATA (1 << 13) /* * Indicates that the content of PERF_SAMPLE_IP points to * the actual instruction that triggered the event. See also diff --git a/kernel/events/core.c b/kernel/events/core.c index 77c96d18c23a..98c0845fcd20 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4791,6 +4791,9 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + if (!(vma->vm_flags & VM_EXEC)) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; rcu_read_lock(); -- cgit v1.2.3 From bc0caf099d9df4dd0fad24992b043b40541f4200 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:31 -0700 Subject: workqueue: fix race condition in unbound workqueue free path 8864b4e59 ("workqueue: implement get/put_pwq()") implemented pwq (pool_workqueue) refcnting which frees workqueue when the last pwq goes away. It determined whether it was the last pwq by testing wq->pwqs is empty. Unfortunately, the test was done outside wq->mutex and multiple pwq release could race and try to free wq multiple times leading to oops. Test wq->pwqs emptiness while holding wq->mutex. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 04a8b98d30ce..4d344326ae97 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3534,6 +3534,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) unbound_release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; + bool is_last; if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) return; @@ -3545,6 +3546,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) */ mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); put_unbound_pool(pool); @@ -3554,7 +3556,7 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Free it. */ - if (list_empty(&wq->pwqs)) + if (is_last) kfree(wq); } -- cgit v1.2.3 From 13e2e556013a543eebd238d1c2759195e3c0c9fc Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:31 -0700 Subject: workqueue: fix unbound workqueue attrs hashing / comparison 29c91e9912b ("workqueue: implement attribute-based unbound worker_pool management") implemented attrs based worker_pool matching. It tried to avoid false negative when comparing cpumasks with custom hash function; unfortunately, the hash and comparison functions fail to ignore CPUs which are not possible. It incorrectly assumed that bitmap_copy() skips leftover bits in the last word of bitmap and cpumask_equal() ignores impossible CPUs. This patch updates attrs->cpumask handling such that impossible CPUs are properly ignored. * Hash and copy functions no longer do anything special. They expect their callers to clear impossible CPUs. * alloc_workqueue_attrs() initializes the cpumask to cpu_possible_mask instead of setting all bits and explicit cpumask_setall() for unbound_std_wq_attrs[] in init_workqueues() is dropped. * apply_workqueue_attrs() is now responsible for ignoring impossible CPUs. It makes a copy of @attrs and clears impossible CPUs before doing anything else. Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 54 ++++++++++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4d344326ae97..abe1f0da4600 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3302,7 +3302,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) goto fail; - cpumask_setall(attrs->cpumask); + cpumask_copy(attrs->cpumask, cpu_possible_mask); return attrs; fail: free_workqueue_attrs(attrs); @@ -3316,33 +3316,14 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to, cpumask_copy(to->cpumask, from->cpumask); } -/* - * Hacky implementation of jhash of bitmaps which only considers the - * specified number of bits. We probably want a proper implementation in - * include/linux/jhash.h. - */ -static u32 jhash_bitmap(const unsigned long *bitmap, int bits, u32 hash) -{ - int nr_longs = bits / BITS_PER_LONG; - int nr_leftover = bits % BITS_PER_LONG; - unsigned long leftover = 0; - - if (nr_longs) - hash = jhash(bitmap, nr_longs * sizeof(long), hash); - if (nr_leftover) { - bitmap_copy(&leftover, bitmap + nr_longs, nr_leftover); - hash = jhash(&leftover, sizeof(long), hash); - } - return hash; -} - /* hash value of the content of @attr */ static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { u32 hash = 0; hash = jhash_1word(attrs->nice, hash); - hash = jhash_bitmap(cpumask_bits(attrs->cpumask), nr_cpu_ids, hash); + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); return hash; } @@ -3652,7 +3633,8 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { - struct pool_workqueue *pwq, *last_pwq; + struct workqueue_attrs *new_attrs; + struct pool_workqueue *pwq = NULL, *last_pwq; struct worker_pool *pool; /* only unbound workqueues can change attributes */ @@ -3663,15 +3645,21 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) return -EINVAL; + /* make a copy of @attrs and sanitize it */ + new_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!new_attrs) + goto enomem; + + copy_workqueue_attrs(new_attrs, attrs); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) - return -ENOMEM; + goto enomem; - pool = get_unbound_pool(attrs); - if (!pool) { - kmem_cache_free(pwq_cache, pwq); - return -ENOMEM; - } + pool = get_unbound_pool(new_attrs); + if (!pool) + goto enomem; init_and_link_pwq(pwq, wq, pool, &last_pwq); if (last_pwq) { @@ -3681,6 +3669,11 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, } return 0; + +enomem: + kmem_cache_free(pwq_cache, pwq); + free_workqueue_attrs(new_attrs); + return -ENOMEM; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) @@ -4450,10 +4443,7 @@ static int __init init_workqueues(void) struct workqueue_attrs *attrs; BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); - attrs->nice = std_nice[i]; - cpumask_setall(attrs->cpumask); - unbound_std_wq_attrs[i] = attrs; } -- cgit v1.2.3 From 4862125b0256a946d2749a1d5003b0604bc3cb4d Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:31 -0700 Subject: workqueue: fix memory leak in apply_workqueue_attrs() apply_workqueue_attrs() wasn't freeing temp attrs variable @new_attrs in its success path. Fix it. Signed-off-by: Tejun Heo <tj@kernel.org> Reported-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index abe1f0da4600..89480fc8eaa3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3636,6 +3636,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, struct workqueue_attrs *new_attrs; struct pool_workqueue *pwq = NULL, *last_pwq; struct worker_pool *pool; + int ret; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3668,12 +3669,16 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, spin_unlock_irq(&last_pwq->pool->lock); } - return 0; + ret = 0; + /* fall through */ +out_free: + free_workqueue_attrs(new_attrs); + return ret; enomem: kmem_cache_free(pwq_cache, pwq); - free_workqueue_attrs(new_attrs); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) -- cgit v1.2.3 From a892cacc7f4960a39c0fad7bbdf04c5cbf7c229e Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:32 -0700 Subject: workqueue: move pwq_pool_locking outside of get/put_unbound_pool() The scheduled NUMA affinity support for unbound workqueues would need to walk workqueues list and pool related operations on each workqueue. Move wq_pool_mutex locking out of get/put_unbound_pool() to their callers so that pool operations can be performed while walking the workqueues list, which is also protected by wq_pool_mutex. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 89480fc8eaa3..2bf3d8c6e128 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3395,31 +3395,28 @@ static void rcu_free_pool(struct rcu_head *rcu) * safe manner. get_unbound_pool() calls this function on its failure path * and this function should be able to release pools which went through, * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. */ static void put_unbound_pool(struct worker_pool *pool) { struct worker *worker; - mutex_lock(&wq_pool_mutex); - if (--pool->refcnt) { - mutex_unlock(&wq_pool_mutex); + lockdep_assert_held(&wq_pool_mutex); + + if (--pool->refcnt) return; - } /* sanity checks */ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || - WARN_ON(!list_empty(&pool->worklist))) { - mutex_unlock(&wq_pool_mutex); + WARN_ON(!list_empty(&pool->worklist))) return; - } /* release id and unhash */ if (pool->id >= 0) idr_remove(&worker_pool_idr, pool->id); hash_del(&pool->hash_node); - mutex_unlock(&wq_pool_mutex); - /* * Become the manager and destroy all workers. Grabbing * manager_arb prevents @pool's workers from blocking on @@ -3453,13 +3450,15 @@ static void put_unbound_pool(struct worker_pool *pool) * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to * create a new one. On failure, returns NULL. + * + * Should be called with wq_pool_mutex held. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; - mutex_lock(&wq_pool_mutex); + lockdep_assert_held(&wq_pool_mutex); /* do we already have a matching pool? */ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { @@ -3490,10 +3489,8 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) /* install */ hash_add(unbound_pool_hash, &pool->hash_node, hash); out_unlock: - mutex_unlock(&wq_pool_mutex); return pool; fail: - mutex_unlock(&wq_pool_mutex); if (pool) put_unbound_pool(pool); return NULL; @@ -3530,7 +3527,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); + mutex_lock(&wq_pool_mutex); put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + call_rcu_sched(&pwq->rcu, rcu_free_pwq); /* @@ -3654,13 +3654,21 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + mutex_lock(&wq_pool_mutex); + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) + if (!pwq) { + mutex_unlock(&wq_pool_mutex); goto enomem; + } pool = get_unbound_pool(new_attrs); - if (!pool) + if (!pool) { + mutex_unlock(&wq_pool_mutex); goto enomem; + } + + mutex_unlock(&wq_pool_mutex); init_and_link_pwq(pwq, wq, pool, &last_pwq); if (last_pwq) { -- cgit v1.2.3 From bce903809ab3f29eca97e0be5537778c1689c82b Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:32 -0700 Subject: workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[] Unbound workqueues are going to be NUMA-affine. Add wq_numa_tbl_len and wq_numa_possible_cpumask[] in preparation. The former is the highest NUMA node ID + 1 and the latter is masks of possibles CPUs for each NUMA node. This patch only introduces these. Future patches will make use of them. v2: NUMA initialization move into wq_numa_init(). Also, the possible cpumask array is not created if there aren't multiple nodes on the system. wq_numa_enabled bool added. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2bf3d8c6e128..5ca46a2e2616 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -44,6 +44,7 @@ #include <linux/jhash.h> #include <linux/hashtable.h> #include <linux/rculist.h> +#include <linux/nodemask.h> #include "workqueue_internal.h" @@ -253,6 +254,12 @@ struct workqueue_struct { static struct kmem_cache *pwq_cache; +static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ +static cpumask_var_t *wq_numa_possible_cpumask; + /* possible CPUs of each node */ + +static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ + static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ @@ -4407,6 +4414,43 @@ out_unlock: } #endif /* CONFIG_FREEZER */ +static void __init wq_numa_init(void) +{ + cpumask_var_t *tbl; + int node, cpu; + + /* determine NUMA pwq table len - highest node id + 1 */ + for_each_node(node) + wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); + + if (num_possible_nodes() <= 1) + return; + + /* + * We want masks of possible CPUs of each node which isn't readily + * available. Build one from cpu_to_node() which should have been + * fully initialized by now. + */ + tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); + BUG_ON(!tbl); + + for_each_node(node) + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); + if (WARN_ON(node == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + /* happens iff arch is bonkers, let's just proceed */ + return; + } + cpumask_set_cpu(cpu, tbl[node]); + } + + wq_numa_possible_cpumask = tbl; + wq_numa_enabled = true; +} + static int __init init_workqueues(void) { int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; @@ -4423,6 +4467,8 @@ static int __init init_workqueues(void) cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + wq_numa_init(); + /* initialize CPU pools */ for_each_possible_cpu(cpu) { struct worker_pool *pool; -- cgit v1.2.3 From e3c916a4c7f51722785d34d9f9802b70dac3ce93 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:32 -0700 Subject: workqueue: drop 'H' from kworker names of unbound worker pools Currently, all workqueue workers which have negative nice value has 'H' postfixed to their names. This is necessary for per-cpu workers as they use the CPU number instead of pool->id to identify the pool and the 'H' postfix is the only thing distinguishing normal and highpri workers. As workers for unbound pools use pool->id, the 'H' postfix is purely informational. TASK_COMM_LEN is 16 and after the static part and delimiters, there are only five characters left for the pool and worker IDs. We're expecting to have more unbound pools with the scheduled NUMA awareness support. Let's drop the non-essential 'H' postfix from unbound kworker name. While at it, restructure kthread_create*() invocation to help future NUMA related changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5ca46a2e2616..248d18aa2a5d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1644,9 +1644,10 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = pool->attrs->nice < 0 ? "H" : ""; struct worker *worker = NULL; + int node = pool->cpu >= 0 ? cpu_to_node(pool->cpu) : NUMA_NO_NODE; int id = -1; + char id_buf[16]; lockdep_assert_held(&pool->manager_mutex); @@ -1672,13 +1673,13 @@ static struct worker *create_worker(struct worker_pool *pool) worker->id = id; if (pool->cpu >= 0) - worker->task = kthread_create_on_node(worker_thread, - worker, cpu_to_node(pool->cpu), - "kworker/%d:%d%s", pool->cpu, id, pri); + snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, + pool->attrs->nice < 0 ? "H" : ""); else - worker->task = kthread_create(worker_thread, worker, - "kworker/u%d:%d%s", - pool->id, id, pri); + snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + + worker->task = kthread_create_on_node(worker_thread, worker, node, + "kworker/%s", id_buf); if (IS_ERR(worker->task)) goto fail; -- cgit v1.2.3 From f3f90ad46934202eeefac454fd5d89bf73c6aa34 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:34 -0700 Subject: workqueue: determine NUMA node of workers accourding to the allowed cpumask When worker tasks are created using kthread_create_on_node(), currently only per-cpu ones have the matching NUMA node specified. All unbound workers are always created with NUMA_NO_NODE. Now that an unbound worker pool may have an arbitrary cpumask associated with it, this isn't optimal. Add pool->node which is determined by the pool's cpumask. If the pool's cpumask is contained inside a NUMA node proper, the pool is associated with that node, and all workers of the pool are created on that node. This currently only makes difference for unbound worker pools with cpumask contained inside single NUMA node, but this will serve as foundation for making all unbound pools NUMA-affine. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 248d18aa2a5d..3e18c7b865eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -138,6 +138,7 @@ enum { struct worker_pool { spinlock_t lock; /* the pool lock */ int cpu; /* I: the associated cpu */ + int node; /* I: the associated node ID */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -1645,7 +1646,6 @@ static struct worker *alloc_worker(void) static struct worker *create_worker(struct worker_pool *pool) { struct worker *worker = NULL; - int node = pool->cpu >= 0 ? cpu_to_node(pool->cpu) : NUMA_NO_NODE; int id = -1; char id_buf[16]; @@ -1678,7 +1678,7 @@ static struct worker *create_worker(struct worker_pool *pool) else snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); - worker->task = kthread_create_on_node(worker_thread, worker, node, + worker->task = kthread_create_on_node(worker_thread, worker, pool->node, "kworker/%s", id_buf); if (IS_ERR(worker->task)) goto fail; @@ -3360,6 +3360,7 @@ static int init_worker_pool(struct worker_pool *pool) spin_lock_init(&pool->lock); pool->id = -1; pool->cpu = -1; + pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); @@ -3465,6 +3466,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { u32 hash = wqattrs_hash(attrs); struct worker_pool *pool; + int node; lockdep_assert_held(&wq_pool_mutex); @@ -3487,6 +3489,17 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(pool->attrs->cpumask, + wq_numa_possible_cpumask[node])) { + pool->node = node; + break; + } + } + } + if (worker_pool_assign_id(pool) < 0) goto fail; @@ -4480,6 +4493,7 @@ static int __init init_workqueues(void) pool->cpu = cpu; cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); pool->attrs->nice = std_nice[i++]; + pool->node = cpu_to_node(cpu); /* alloc pool ID */ mutex_lock(&wq_pool_mutex); -- cgit v1.2.3 From 6029a91829ad2bd876fed78bc088d3469a9dd777 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:34 -0700 Subject: workqueue: add workqueue->unbound_attrs Currently, when exposing attrs of an unbound workqueue via sysfs, the workqueue_attrs of first_pwq() is used as that should equal the current state of the workqueue. The planned NUMA affinity support will make unbound workqueues make use of multiple pool_workqueues for different NUMA nodes and the above assumption will no longer hold. Introduce workqueue->unbound_attrs which records the current attrs in effect and use it for sysfs instead of first_pwq()->attrs. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3e18c7b865eb..32b474463053 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -244,6 +244,8 @@ struct workqueue_struct { int nr_drainers; /* WQ: drain in progress */ int saved_max_active; /* WQ: saved pwq max_active */ + struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ + #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ #endif @@ -3088,10 +3090,9 @@ static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, struct workqueue_struct *wq = dev_to_wq(dev); int written; - rcu_read_lock_sched(); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - first_pwq(wq)->pool->attrs->nice); - rcu_read_unlock_sched(); + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); return written; } @@ -3105,9 +3106,9 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) if (!attrs) return NULL; - rcu_read_lock_sched(); - copy_workqueue_attrs(attrs, first_pwq(wq)->pool->attrs); - rcu_read_unlock_sched(); + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); return attrs; } @@ -3138,10 +3139,9 @@ static ssize_t wq_cpumask_show(struct device *dev, struct workqueue_struct *wq = dev_to_wq(dev); int written; - rcu_read_lock_sched(); - written = cpumask_scnprintf(buf, PAGE_SIZE, - first_pwq(wq)->pool->attrs->cpumask); - rcu_read_unlock_sched(); + mutex_lock(&wq->mutex); + written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); + mutex_unlock(&wq->mutex); written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); return written; @@ -3558,8 +3558,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) * If we're the last pwq going away, @wq is already dead and no one * is gonna access it anymore. Free it. */ - if (is_last) + if (is_last) { + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); + } } /** @@ -3634,6 +3636,9 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); + if (wq->flags & WQ_UNBOUND) + copy_workqueue_attrs(wq->unbound_attrs, pool->attrs); + mutex_unlock(&wq->mutex); } @@ -3766,6 +3771,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (!wq) return NULL; + if (flags & WQ_UNBOUND) { + wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!wq->unbound_attrs) + goto err_free_wq; + } + vsnprintf(wq->name, namelen, fmt, args1); va_end(args); va_end(args1); @@ -3835,6 +3846,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, return wq; err_free_wq: + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); return NULL; err_destroy: -- cgit v1.2.3 From ecf6881ff349ad8670ec53a7586002d20b5f3b2e Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:34 -0700 Subject: workqueue: make workqueue->name[] fixed len Currently workqueue->name[] is of flexible length. We want to use the flexible field for something more useful and there isn't much benefit in allowing arbitrary name length anyway. Make it fixed len capping at 24 bytes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 32b474463053..c8c5838c52c9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -101,6 +101,8 @@ enum { */ RESCUER_NICE_LEVEL = -20, HIGHPRI_NICE_LEVEL = -20, + + WQ_NAME_LEN = 24, }; /* @@ -252,7 +254,7 @@ struct workqueue_struct { #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif - char name[]; /* I: workqueue name */ + char name[WQ_NAME_LEN]; /* I: workqueue name */ }; static struct kmem_cache *pwq_cache; @@ -3757,17 +3759,12 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct lock_class_key *key, const char *lock_name, ...) { - va_list args, args1; + va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; - size_t namelen; - - /* determine namelen, allocate wq and format name */ - va_start(args, lock_name); - va_copy(args1, args); - namelen = vsnprintf(NULL, 0, fmt, args) + 1; - wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + /* allocate wq and format name */ + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; @@ -3777,9 +3774,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, goto err_free_wq; } - vsnprintf(wq->name, namelen, fmt, args1); + va_start(args, lock_name); + vsnprintf(wq->name, sizeof(wq->name), fmt, args); va_end(args); - va_end(args1); max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); -- cgit v1.2.3 From 2728fd2f098c3cc5efaf3f0433855e579d5e4f28 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: workqueue: move hot fields of workqueue_struct to the end Move wq->flags and ->cpu_pwqs to the end of workqueue_struct and align them to the cacheline. These two fields are used in the work item issue path and thus hot. The scheduled NUMA affinity support will add dispatch table at the end of workqueue_struct and relocating these two fields will allow us hitting only single cacheline on hot paths. Note that wq->pwqs isn't moved although it currently is being used in the work item issue path for unbound workqueues. The dispatch table mentioned above will replace its use in the issue path, so it will become cold once NUMA support is implemented. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8c5838c52c9..4c53fa216732 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -227,8 +227,6 @@ struct wq_device; * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwq's */ struct list_head pwqs; /* WR: all pwqs of this wq */ struct list_head list; /* PL: list of all workqueues */ @@ -255,6 +253,10 @@ struct workqueue_struct { struct lockdep_map lockdep_map; #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ + + /* hot fields used during command issue, aligned to cacheline */ + unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ + struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; -- cgit v1.2.3 From df2d5ae4995b3fb9392b6089b9623d20b6c3a542 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: workqueue: map an unbound workqueues to multiple per-node pool_workqueues Currently, an unbound workqueue has only one "current" pool_workqueue associated with it. It may have multple pool_workqueues but only the first pool_workqueue servies new work items. For NUMA affinity, we want to change this so that there are multiple current pool_workqueues serving different NUMA nodes. Introduce workqueue->numa_pwq_tbl[] which is indexed by NUMA node and points to the pool_workqueue to use for each possible node. This replaces first_pwq() in __queue_work() and workqueue_congested(). numa_pwq_tbl[] is currently initialized to point to the same pool_workqueue as first_pwq() so this patch doesn't make any behavior changes. v2: Use rcu_dereference_raw() in unbound_pwq_by_node() as the function may be called only with wq->mutex held. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4c53fa216732..170226a24da8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -257,6 +257,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ + struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */ }; static struct kmem_cache *pwq_cache; @@ -525,6 +526,22 @@ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) pwqs_node); } +/** + * unbound_pwq_by_node - return the unbound pool_workqueue for the given node + * @wq: the target workqueue + * @node: the node ID + * + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. + */ +static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, + int node) +{ + assert_rcu_or_wq_mutex(wq); + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -1278,14 +1295,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, WARN_ON_ONCE(!is_chained_work(wq))) return; retry: + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); + /* pwq which will be used unless @work is executing elsewhere */ - if (!(wq->flags & WQ_UNBOUND)) { - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); - } else { - pwq = first_pwq(wq); - } + else + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); /* * If @work was previously on a different pool, it might still be @@ -1315,8 +1332,8 @@ retry: * pwq is determined and locked. For unbound pools, we could have * raced with pwq release and it could already be dead. If its * refcnt is zero, repeat pwq selection. Note that pwqs never die - * without another pwq replacing it as the first pwq or while a - * work item is executing on it, so the retying is guaranteed to + * without another pwq replacing it in the numa_pwq_tbl or while + * work items are executing on it, so the retrying is guaranteed to * make forward-progress. */ if (unlikely(!pwq->refcnt)) { @@ -3614,6 +3631,8 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, struct worker_pool *pool, struct pool_workqueue **p_last_pwq) { + int node; + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); pwq->pool = pool; @@ -3640,8 +3659,11 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); - if (wq->flags & WQ_UNBOUND) + if (wq->flags & WQ_UNBOUND) { copy_workqueue_attrs(wq->unbound_attrs, pool->attrs); + for_each_node(node) + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + } mutex_unlock(&wq->mutex); } @@ -3761,12 +3783,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct lock_class_key *key, const char *lock_name, ...) { + size_t tbl_size = 0; va_list args; struct workqueue_struct *wq; struct pool_workqueue *pwq; /* allocate wq and format name */ - wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (flags & WQ_UNBOUND) + tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); + + wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) return NULL; @@ -3994,7 +4020,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) if (!(wq->flags & WQ_UNBOUND)) pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); else - pwq = first_pwq(wq); + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); ret = !list_empty(&pwq->delayed_works); rcu_read_unlock_sched(); -- cgit v1.2.3 From f147f29eb7c4959e5f8be604ce2d23979c86378c Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: workqueue: break init_and_link_pwq() into two functions and introduce alloc_unbound_pwq() Break init_and_link_pwq() into init_pwq() and link_pwq() and move unbound-workqueue specific handling into apply_workqueue_attrs(). Also, factor out unbound pool and pool_workqueue allocation into alloc_unbound_pwq(). This reorganization is to prepare for NUMA affinity and doesn't introduce any functional changes. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 78 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 170226a24da8..c8d047b6c895 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3626,13 +3626,10 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_unlock_irq(&pwq->pool->lock); } -static void init_and_link_pwq(struct pool_workqueue *pwq, - struct workqueue_struct *wq, - struct worker_pool *pool, - struct pool_workqueue **p_last_pwq) +/* initialize newly zalloced @pwq which is associated with @wq and @pool */ +static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, + struct worker_pool *pool) { - int node; - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); pwq->pool = pool; @@ -3642,8 +3639,15 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, INIT_LIST_HEAD(&pwq->delayed_works); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); +} - mutex_lock(&wq->mutex); +/* sync @pwq with the current state of its associated wq and link it */ +static void link_pwq(struct pool_workqueue *pwq, + struct pool_workqueue **p_last_pwq) +{ + struct workqueue_struct *wq = pwq->wq; + + lockdep_assert_held(&wq->mutex); /* * Set the matching work_color. This is synchronized with @@ -3658,14 +3662,29 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, /* link in @pwq */ list_add_rcu(&pwq->pwqs_node, &wq->pwqs); +} - if (wq->flags & WQ_UNBOUND) { - copy_workqueue_attrs(wq->unbound_attrs, pool->attrs); - for_each_node(node) - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); +/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ +static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct worker_pool *pool; + struct pool_workqueue *pwq; + + lockdep_assert_held(&wq_pool_mutex); + + pool = get_unbound_pool(attrs); + if (!pool) + return NULL; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) { + put_unbound_pool(pool); + return NULL; } - mutex_unlock(&wq->mutex); + init_pwq(pwq, wq, pool); + return pwq; } /** @@ -3686,9 +3705,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct workqueue_attrs *new_attrs; - struct pool_workqueue *pwq = NULL, *last_pwq; - struct worker_pool *pool; - int ret; + struct pool_workqueue *pwq, *last_pwq; + int node, ret; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3707,22 +3725,21 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); mutex_lock(&wq_pool_mutex); - - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) { - mutex_unlock(&wq_pool_mutex); + pwq = alloc_unbound_pwq(wq, new_attrs); + mutex_unlock(&wq_pool_mutex); + if (!pwq) goto enomem; - } - pool = get_unbound_pool(new_attrs); - if (!pool) { - mutex_unlock(&wq_pool_mutex); - goto enomem; - } + mutex_lock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); + link_pwq(pwq, &last_pwq); + + copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + for_each_node(node) + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + + mutex_unlock(&wq->mutex); - init_and_link_pwq(pwq, wq, pool, &last_pwq); if (last_pwq) { spin_lock_irq(&last_pwq->pool->lock); put_pwq(last_pwq); @@ -3736,7 +3753,6 @@ out_free: return ret; enomem: - kmem_cache_free(pwq_cache, pwq); ret = -ENOMEM; goto out_free; } @@ -3757,7 +3773,11 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - init_and_link_pwq(pwq, wq, &cpu_pools[highpri], NULL); + init_pwq(pwq, wq, &cpu_pools[highpri]); + + mutex_lock(&wq->mutex); + link_pwq(pwq, NULL); + mutex_unlock(&wq->mutex); } return 0; } else { -- cgit v1.2.3 From e50aba9aea63b7617887b4d9694184f478731c82 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: workqueue: use NUMA-aware allocation for pool_workqueues Use kmem_cache_alloc_node() with @pool->node instead of kmem_cache_zalloc() when allocating a pool_workqueue so that it's allocated on the same node as the associated worker_pool. As there's no no kmem_cache_zalloc_node(), move zeroing to init_pwq(). This was suggested by Lai Jiangshan. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8d047b6c895..07ec57459457 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3626,12 +3626,14 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) spin_unlock_irq(&pwq->pool->lock); } -/* initialize newly zalloced @pwq which is associated with @wq and @pool */ +/* initialize newly alloced @pwq which is associated with @wq and @pool */ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, struct worker_pool *pool) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + memset(pwq, 0, sizeof(*pwq)); + pwq->pool = pool; pwq->wq = wq; pwq->flush_color = -1; @@ -3677,7 +3679,7 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, if (!pool) return NULL; - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); if (!pwq) { put_unbound_pool(pool); return NULL; -- cgit v1.2.3 From 1befcf3073fa083e7dc48c384ce06f3bd900f514 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: workqueue: introduce numa_pwq_tbl_install() Factor out pool_workqueue linking and installation into numa_pwq_tbl[] from apply_workqueue_attrs() into numa_pwq_tbl_install(). link_pwq() is made safe to call multiple times. numa_pwq_tbl_install() links the pwq, installs it into numa_pwq_tbl[] at the specified node and returns the old entry. @last_pwq is removed from link_pwq() as the return value of the new function can be used instead. This is to prepare for NUMA affinity support for unbound workqueues. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 07ec57459457..3825c14304e1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3639,24 +3639,26 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, pwq->flush_color = -1; pwq->refcnt = 1; INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ -static void link_pwq(struct pool_workqueue *pwq, - struct pool_workqueue **p_last_pwq) +static void link_pwq(struct pool_workqueue *pwq) { struct workqueue_struct *wq = pwq->wq; lockdep_assert_held(&wq->mutex); + /* may be called multiple times, ignore if already linked */ + if (!list_empty(&pwq->pwqs_node)) + return; + /* * Set the matching work_color. This is synchronized with * wq->mutex to avoid confusing flush_workqueue(). */ - if (p_last_pwq) - *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; /* sync max_active to the current setting */ @@ -3689,6 +3691,23 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } +/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ +static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, + int node, + struct pool_workqueue *pwq) +{ + struct pool_workqueue *old_pwq; + + lockdep_assert_held(&wq->mutex); + + /* link_pwq() can handle duplicate calls */ + link_pwq(pwq); + + old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + return old_pwq; +} + /** * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue * @wq: the target workqueue @@ -3707,7 +3726,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct workqueue_attrs *new_attrs; - struct pool_workqueue *pwq, *last_pwq; + struct pool_workqueue *pwq, *last_pwq = NULL; int node, ret; /* only unbound workqueues can change attributes */ @@ -3734,11 +3753,9 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, mutex_lock(&wq->mutex); - link_pwq(pwq, &last_pwq); - copy_workqueue_attrs(wq->unbound_attrs, new_attrs); for_each_node(node) - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + last_pwq = numa_pwq_tbl_install(wq, node, pwq); mutex_unlock(&wq->mutex); @@ -3778,7 +3795,7 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) init_pwq(pwq, wq, &cpu_pools[highpri]); mutex_lock(&wq->mutex); - link_pwq(pwq, NULL); + link_pwq(pwq); mutex_unlock(&wq->mutex); } return 0; -- cgit v1.2.3 From dce90d47c4288c7d3c1988bebb059ea7451d5fd5 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:35 -0700 Subject: workqueue: introduce put_pwq_unlocked() Factor out lock pool, put_pwq(), unlock sequence into put_pwq_unlocked(). The two existing places are converted and there will be more with NUMA affinity support. This is to prepare for NUMA affinity support for unbound workqueues and doesn't introduce any functional difference. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3825c14304e1..d9a4aeb844d5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1057,6 +1057,25 @@ static void put_pwq(struct pool_workqueue *pwq) schedule_work(&pwq->unbound_release_work); } +/** + * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock + * @pwq: pool_workqueue to put (can be %NULL) + * + * put_pwq() with locking. This function also allows %NULL @pwq. + */ +static void put_pwq_unlocked(struct pool_workqueue *pwq) +{ + if (pwq) { + /* + * As both pwqs and pools are sched-RCU protected, the + * following lock operations are safe. + */ + spin_lock_irq(&pwq->pool->lock); + put_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); + } +} + static void pwq_activate_delayed_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -3759,12 +3778,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, mutex_unlock(&wq->mutex); - if (last_pwq) { - spin_lock_irq(&last_pwq->pool->lock); - put_pwq(last_pwq); - spin_unlock_irq(&last_pwq->pool->lock); - } - + put_pwq_unlocked(last_pwq); ret = 0; /* fall through */ out_free: @@ -3979,16 +3993,12 @@ void destroy_workqueue(struct workqueue_struct *wq) } else { /* * We're the sole accessor of @wq at this point. Directly - * access the first pwq and put the base ref. As both pwqs - * and pools are sched-RCU protected, the lock operations - * are safe. @wq will be freed when the last pwq is - * released. + * access the first pwq and put the base ref. @wq will be + * freed when the last pwq is released. */ pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, pwqs_node); - spin_lock_irq(&pwq->pool->lock); - put_pwq(pwq); - spin_unlock_irq(&pwq->pool->lock); + put_pwq_unlocked(pwq); } } EXPORT_SYMBOL_GPL(destroy_workqueue); -- cgit v1.2.3 From 4c16bd327c74d6678858706211a0c6e4e53eb3e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:36 -0700 Subject: workqueue: implement NUMA affinity for unbound workqueues Currently, an unbound workqueue has single current, or first, pwq (pool_workqueue) to which all new work items are queued. This often isn't optimal on NUMA machines as workers may jump around across node boundaries and work items get assigned to workers without any regard to NUMA affinity. This patch implements NUMA affinity for unbound workqueues. Instead of mapping all entries of numa_pwq_tbl[] to the same pwq, apply_workqueue_attrs() now creates a separate pwq covering the intersecting CPUs for each NUMA node which has online CPUs in @attrs->cpumask. Nodes which don't have intersecting possible CPUs are mapped to pwqs covering whole @attrs->cpumask. As CPUs come up and go down, the pool association is changed accordingly. Changing pool association may involve allocating new pools which may fail. To avoid failing CPU_DOWN, each workqueue always keeps a default pwq which covers whole attrs->cpumask which is used as fallback if pool creation fails during a CPU hotplug operation. This ensures that all work items issued on a NUMA node is executed on the same node as long as the workqueue allows execution on the CPUs of the node. As this maps a workqueue to multiple pwqs and max_active is per-pwq, this change the behavior of max_active. The limit is now per NUMA node instead of global. While this is an actual change, max_active is already per-cpu for per-cpu workqueues and primarily used as safety mechanism rather than for active concurrency control. Concurrency is usually limited from workqueue users by the number of concurrently active work items and this change shouldn't matter much. v2: Fixed pwq freeing in apply_workqueue_attrs() error path. Spotted by Lai. v3: The previous version incorrectly made a workqueue spanning multiple nodes spread work items over all online CPUs when some of its nodes don't have any desired cpus. Reimplemented so that NUMA affinity is properly updated as CPUs go up and down. This problem was spotted by Lai Jiangshan. v4: destroy_workqueue() was putting wq->dfl_pwq and then clearing it; however, wq may be freed at any time after dfl_pwq is put making the clearing use-after-free. Clear wq->dfl_pwq before putting it. v5: apply_workqueue_attrs() was leaking @tmp_attrs, @new_attrs and @pwq_tbl after success. Fixed. Retry loop in wq_update_unbound_numa_attrs() isn't necessary as application of new attrs is excluded via CPU hotplug. Removed. Documentation on CPU affinity guarantee on CPU_DOWN added. All changes are suggested by Lai Jiangshan. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- kernel/workqueue.c | 278 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 259 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d9a4aeb844d5..57cd77de4a4f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -45,6 +45,7 @@ #include <linux/hashtable.h> #include <linux/rculist.h> #include <linux/nodemask.h> +#include <linux/moduleparam.h> #include "workqueue_internal.h" @@ -245,6 +246,7 @@ struct workqueue_struct { int saved_max_active; /* WQ: saved pwq max_active */ struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ + struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -268,6 +270,9 @@ static cpumask_var_t *wq_numa_possible_cpumask; static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ +/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; + static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ @@ -3710,6 +3715,61 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, return pwq; } +/* undo alloc_unbound_pwq(), used only in the error path */ +static void free_unbound_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&wq_pool_mutex); + + if (pwq) { + put_unbound_pool(pwq->pool); + kfree(pwq); + } +} + +/** + * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of interest + * @node: the target NUMA node + * @cpu_going_down: if >= 0, the CPU to consider as offline + * @cpumask: outarg, the resulting cpumask + * + * Calculate the cpumask a workqueue with @attrs should use on @node. If + * @cpu_going_down is >= 0, that cpu is considered offline during + * calculation. The result is stored in @cpumask. This function returns + * %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. + * + * If NUMA affinity is not enabled, @attrs->cpumask is always used. If + * enabled and @node has online CPUs requested by @attrs, the returned + * cpumask is the intersection of the possible CPUs of @node and + * @attrs->cpumask. + * + * The caller is responsible for ensuring that the cpumask of @node stays + * stable. + */ +static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, + int cpu_going_down, cpumask_t *cpumask) +{ + if (!wq_numa_enabled) + goto use_dfl; + + /* does @node have any online CPUs @attrs wants? */ + cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + if (cpu_going_down >= 0) + cpumask_clear_cpu(cpu_going_down, cpumask); + + if (cpumask_empty(cpumask)) + goto use_dfl; + + /* yeap, return possible CPUs in @node that @attrs wants */ + cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + return !cpumask_equal(cpumask, attrs->cpumask); + +use_dfl: + cpumask_copy(cpumask, attrs->cpumask); + return false; +} + /* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, int node, @@ -3732,11 +3792,12 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, * @wq: the target workqueue * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() * - * Apply @attrs to an unbound workqueue @wq. If @attrs doesn't match the - * current attributes, a new pwq is created and made the first pwq which - * will serve all new work items. Older pwqs are released as in-flight - * work items finish. Note that a work item which repeatedly requeues - * itself back-to-back will stay on its current pwq. + * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA + * machines, this function maps a separate pwq to each NUMA node with + * possibles CPUs in @attrs->cpumask so that work items are affine to the + * NUMA node it was issued on. Older pwqs are released as in-flight work + * items finish. Note that a work item which repeatedly requeues itself + * back-to-back will stay on its current pwq. * * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on * failure. @@ -3744,8 +3805,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { - struct workqueue_attrs *new_attrs; - struct pool_workqueue *pwq, *last_pwq = NULL; + struct workqueue_attrs *new_attrs, *tmp_attrs; + struct pool_workqueue **pwq_tbl, *dfl_pwq; int node, ret; /* only unbound workqueues can change attributes */ @@ -3756,40 +3817,191 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) return -EINVAL; - /* make a copy of @attrs and sanitize it */ + pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!new_attrs) + tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pwq_tbl || !new_attrs || !tmp_attrs) goto enomem; + /* make a copy of @attrs and sanitize it */ copy_workqueue_attrs(new_attrs, attrs); cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + /* + * We may create multiple pwqs with differing cpumasks. Make a + * copy of @new_attrs which will be modified and used to obtain + * pools. + */ + copy_workqueue_attrs(tmp_attrs, new_attrs); + + /* + * CPUs should stay stable across pwq creations and installations. + * Pin CPUs, determine the target cpumask for each node and create + * pwqs accordingly. + */ + get_online_cpus(); + mutex_lock(&wq_pool_mutex); - pwq = alloc_unbound_pwq(wq, new_attrs); + + /* + * If something goes wrong during CPU up/down, we'll fall back to + * the default pwq covering whole @attrs->cpumask. Always create + * it even if we don't use it immediately. + */ + dfl_pwq = alloc_unbound_pwq(wq, new_attrs); + if (!dfl_pwq) + goto enomem_pwq; + + for_each_node(node) { + if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { + pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); + if (!pwq_tbl[node]) + goto enomem_pwq; + } else { + dfl_pwq->refcnt++; + pwq_tbl[node] = dfl_pwq; + } + } + mutex_unlock(&wq_pool_mutex); - if (!pwq) - goto enomem; + /* all pwqs have been created successfully, let's install'em */ mutex_lock(&wq->mutex); copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + + /* save the previous pwq and install the new one */ for_each_node(node) - last_pwq = numa_pwq_tbl_install(wq, node, pwq); + pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); + + /* @dfl_pwq might not have been used, ensure it's linked */ + link_pwq(dfl_pwq); + swap(wq->dfl_pwq, dfl_pwq); mutex_unlock(&wq->mutex); - put_pwq_unlocked(last_pwq); + /* put the old pwqs */ + for_each_node(node) + put_pwq_unlocked(pwq_tbl[node]); + put_pwq_unlocked(dfl_pwq); + + put_online_cpus(); ret = 0; /* fall through */ out_free: + free_workqueue_attrs(tmp_attrs); free_workqueue_attrs(new_attrs); + kfree(pwq_tbl); return ret; +enomem_pwq: + free_unbound_pwq(dfl_pwq); + for_each_node(node) + if (pwq_tbl && pwq_tbl[node] != dfl_pwq) + free_unbound_pwq(pwq_tbl[node]); + mutex_unlock(&wq_pool_mutex); + put_online_cpus(); enomem: ret = -ENOMEM; goto out_free; } +/** + * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * @wq: the target workqueue + * @cpu: the CPU coming up or going down + * @online: whether @cpu is coming up or going down + * + * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and + * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of + * @wq accordingly. + * + * If NUMA affinity can't be adjusted due to memory allocation failure, it + * falls back to @wq->dfl_pwq which may not be optimal but is always + * correct. + * + * Note that when the last allowed CPU of a NUMA node goes offline for a + * workqueue with a cpumask spanning multiple nodes, the workers which were + * already executing the work items for the workqueue will lose their CPU + * affinity and may execute on any CPU. This is similar to how per-cpu + * workqueues behave on CPU_DOWN. If a workqueue user wants strict + * affinity, it's the user's responsibility to flush the work item from + * CPU_DOWN_PREPARE. + */ +static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, + bool online) +{ + int node = cpu_to_node(cpu); + int cpu_off = online ? -1 : cpu; + struct pool_workqueue *old_pwq = NULL, *pwq; + struct workqueue_attrs *target_attrs; + cpumask_t *cpumask; + + lockdep_assert_held(&wq_pool_mutex); + + if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) + return; + + /* + * We don't wanna alloc/free wq_attrs for each wq for each CPU. + * Let's use a preallocated one. The following buf is protected by + * CPU hotplug exclusion. + */ + target_attrs = wq_update_unbound_numa_attrs_buf; + cpumask = target_attrs->cpumask; + + mutex_lock(&wq->mutex); + + copy_workqueue_attrs(target_attrs, wq->unbound_attrs); + pwq = unbound_pwq_by_node(wq, node); + + /* + * Let's determine what needs to be done. If the target cpumask is + * different from wq's, we need to compare it to @pwq's and create + * a new one if they don't match. If the target cpumask equals + * wq's, the default pwq should be used. If @pwq is already the + * default one, nothing to do; otherwise, install the default one. + */ + if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { + if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) + goto out_unlock; + } else { + if (pwq == wq->dfl_pwq) + goto out_unlock; + else + goto use_dfl_pwq; + } + + mutex_unlock(&wq->mutex); + + /* create a new pwq */ + pwq = alloc_unbound_pwq(wq, target_attrs); + if (!pwq) { + pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); + goto out_unlock; + } + + /* + * Install the new pwq. As this function is called only from CPU + * hotplug callbacks and applying a new attrs is wrapped with + * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed + * inbetween. + */ + mutex_lock(&wq->mutex); + old_pwq = numa_pwq_tbl_install(wq, node, pwq); + goto out_unlock; + +use_dfl_pwq: + spin_lock_irq(&wq->dfl_pwq->pool->lock); + get_pwq(wq->dfl_pwq); + spin_unlock_irq(&wq->dfl_pwq->pool->lock); + old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); +out_unlock: + mutex_unlock(&wq->mutex); + put_pwq_unlocked(old_pwq); +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3942,6 +4154,7 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; + int node; /* drain it before proceeding with destruction */ drain_workqueue(wq); @@ -3993,11 +4206,21 @@ void destroy_workqueue(struct workqueue_struct *wq) } else { /* * We're the sole accessor of @wq at this point. Directly - * access the first pwq and put the base ref. @wq will be - * freed when the last pwq is released. + * access numa_pwq_tbl[] and dfl_pwq to put the base refs. + * @wq will be freed when the last pwq is released. */ - pwq = list_first_entry(&wq->pwqs, struct pool_workqueue, - pwqs_node); + for_each_node(node) { + pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); + put_pwq_unlocked(pwq); + } + + /* + * Put dfl_pwq. @wq may be freed any time after dfl_pwq is + * put. Don't access it afterwards. + */ + pwq = wq->dfl_pwq; + wq->dfl_pwq = NULL; put_pwq_unlocked(pwq); } } @@ -4285,6 +4508,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, { int cpu = (unsigned long)hcpu; struct worker_pool *pool; + struct workqueue_struct *wq; int pi; switch (action & ~CPU_TASKS_FROZEN) { @@ -4317,6 +4541,10 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, mutex_unlock(&pool->manager_mutex); } + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); + mutex_unlock(&wq_pool_mutex); break; } @@ -4333,12 +4561,21 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, { int cpu = (unsigned long)hcpu; struct work_struct unbind_work; + struct workqueue_struct *wq; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - /* unbinding should happen on the local CPU */ + /* unbinding per-cpu workers should happen on the local CPU */ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); break; } @@ -4526,6 +4763,9 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); + BUG_ON(!wq_update_unbound_numa_attrs_buf); + /* * We want masks of possible CPUs of each node which isn't readily * available. Build one from cpu_to_node() which should have been -- cgit v1.2.3 From d55262c4d164759a8debe772da6c9b16059dec47 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 1 Apr 2013 11:23:38 -0700 Subject: workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity Unbound workqueues are now NUMA aware. Let's add some control knobs and update sysfs interface accordingly. * Add kernel param workqueue.numa_disable which disables NUMA affinity globally. * Replace sysfs file "pool_id" with "pool_ids" which contain node:pool_id pairs. This change is userland-visible but "pool_id" hasn't seen a release yet, so this is okay. * Add a new sysf files "numa" which can toggle NUMA affinity on individual workqueues. This is implemented as attrs->no_numa whichn is special in that it isn't part of a pool's attributes. It only affects how apply_workqueue_attrs() picks which pools to use. After "pool_ids" change, first_pwq() doesn't have any user left. Removed. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com> --- Documentation/kernel-parameters.txt | 9 ++++ include/linux/workqueue.h | 5 +++ kernel/workqueue.c | 82 ++++++++++++++++++++++++++----------- 3 files changed, 73 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..c75ea0b8ec59 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. or other driver-specific files in the Documentation/watchdog/ directory. + workqueue.disable_numa + By default, all work items queued to unbound + workqueues are affine to the NUMA nodes they're + issued on, which results in better behavior in + general. If NUMA affinity needs to be disabled for + whatever reason, this option can be used. Note + that this also can be controlled per-workqueue for + workqueues visible under /sys/bus/workqueue/. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 835d12b76960..717975639378 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -119,10 +119,15 @@ struct delayed_work { /* * A struct for workqueue attributes. This can be used to change * attributes of an unbound workqueue. + * + * Unlike other fields, ->no_numa isn't a property of a worker_pool. It + * only modifies how apply_workqueue_attrs() select pools and thus doesn't + * participate in pool hash calculations or equality comparisons. */ struct workqueue_attrs { int nice; /* nice level */ cpumask_var_t cpumask; /* allowed CPUs */ + bool no_numa; /* disable NUMA affinity */ }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 57cd77de4a4f..729ac6a44860 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -516,21 +519,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * first_pwq - return the first pool_workqueue of the specified workqueue - * @wq: the target workqueue - * - * This must be called either with wq->mutex held or sched RCU read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - */ -static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) -{ - assert_rcu_or_wq_mutex(wq); - return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, - pwqs_node); -} - /** * unbound_pwq_by_node - return the unbound pool_workqueue for the given node * @wq: the target workqueue @@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = { __ATTR_NULL, }; -static ssize_t wq_pool_id_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); - struct worker_pool *pool; - int written; + const char *delim = ""; + int node, written = 0; rcu_read_lock_sched(); - pool = first_pwq(wq)->pool; - written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock_sched(); return written; @@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev, return ret ?: count; } +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), __ATTR_NULL, }; @@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { - if (!wq_numa_enabled) + if (!wq_numa_enabled || attrs->no_numa) goto use_dfl; /* does @node have any online CPUs @attrs wants? */ @@ -3951,6 +3980,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, cpumask = target_attrs->cpumask; mutex_lock(&wq->mutex); + if (wq->unbound_attrs->no_numa) + goto out_unlock; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); @@ -4763,6 +4794,11 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; + if (wq_disable_numa) { + pr_info("workqueue: NUMA affinity support disabled\n"); + return; + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); BUG_ON(!wq_update_unbound_numa_attrs_buf); -- cgit v1.2.3 From 1e2ccd1c0f67c3f958d6139de2496787b9a57182 Mon Sep 17 00:00:00 2001 From: Kevin Wilson <wkevils@gmail.com> Date: Mon, 1 Apr 2013 10:51:37 +0300 Subject: cgroup: remove unused parameter in cgroup_task_migrate(). This patch removes unused parameter from cgroup_task_migrate(). Signed-off-by: Kevin Wilson <wkevils@gmail.com> Acked-by: Acked-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 04fa2abf94b2..4aee5bdd66c8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1911,7 +1911,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * * Must be called with cgroup_mutex and threadgroup locked. */ -static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, +static void cgroup_task_migrate(struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; @@ -2084,7 +2084,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ -- cgit v1.2.3 From f281769e81b49840f1857f6dfac049350e678350 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sun, 17 Mar 2013 18:54:44 +0100 Subject: uprobes: Use file_inode() Cleanup. Now that we have f_inode/file_inode() we can use it instead of vm_file->f_mapping->host. This should not make any difference for uprobes, but in theory this change is more correct. We use this inode as a key, to compare it with uprobe->inode set by uprobe_register(inode), and the caller uses d_inode. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- kernel/events/uprobes.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index a567c8c7ef31..26bc2e24e9e3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -758,7 +758,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new) down_write(&mm->mmap_sem); vma = find_vma(mm, info->vaddr); if (!vma || !valid_vma(vma, is_register) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) goto unlock; if (vma->vm_start > info->vaddr || @@ -917,7 +917,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) loff_t offset; if (!valid_vma(vma, false) || - vma->vm_file->f_mapping->host != uprobe->inode) + file_inode(vma->vm_file) != uprobe->inode) continue; offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT; @@ -1010,7 +1010,7 @@ int uprobe_mmap(struct vm_area_struct *vma) if (no_uprobe_events() || !valid_vma(vma, true)) return 0; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); if (!inode) return 0; @@ -1041,7 +1041,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e struct inode *inode; struct rb_node *n; - inode = vma->vm_file->f_mapping->host; + inode = file_inode(vma->vm_file); min = vaddr_to_offset(vma, start); max = min + (end - start) - 1; @@ -1465,7 +1465,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) vma = find_vma(mm, bp_vaddr); if (vma && vma->vm_start <= bp_vaddr) { if (valid_vma(vma, false)) { - struct inode *inode = vma->vm_file->f_mapping->host; + struct inode *inode = file_inode(vma->vm_file); loff_t offset = vaddr_to_offset(vma, bp_vaddr); uprobe = find_uprobe(inode, offset); -- cgit v1.2.3 From 0908ad6e56b5a6e86745680bc324bdbfac64d0b6 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Date: Fri, 22 Mar 2013 20:46:27 +0530 Subject: uprobes: Add trap variant helper Some architectures like powerpc have multiple variants of the trap instruction. Introduce an additional helper is_trap_insn() for run-time handling of non-uprobe traps on such architectures. While there, change is_swbp_at_addr() to is_trap_at_addr() for reading clarity. With this change, the uprobe registration path will supercede any trap instruction inserted at the requested location, while taking care of delivering the SIGTRAP for cases where the trap notification came in for an address without a uprobe. See [1] for a more detailed explanation. [1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2013-March/104771.html This change was suggested by Oleg Nesterov. Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 02b83db8e2c5..19612881399a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -100,6 +100,7 @@ struct uprobes_state { extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); +extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 26bc2e24e9e3..ca9012930ce7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,6 +173,20 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } +/** + * is_trap_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). + */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ + return is_swbp_insn(insn); +} + static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) { void *kaddr = kmap_atomic(page); @@ -185,6 +199,15 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_opcode_t old_opcode; bool is_swbp; + /* + * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. + * We do not check if it is any other 'trap variant' which could + * be conditional trap instruction such as the one powerpc supports. + * + * The logic is that we do not care if the underlying instruction + * is a trap variant; uprobes always wins over any other (gdb) + * breakpoint. + */ copy_opcode(page, vaddr, &old_opcode); is_swbp = is_swbp_insn(&old_opcode); @@ -204,7 +227,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. */ @@ -550,7 +573,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -1431,7 +1454,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; @@ -1452,7 +1475,8 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) copy_opcode(page, vaddr, &opcode); put_page(page); out: - return is_swbp_insn(&opcode); + /* This needs to return true for any variant of the trap insn */ + return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1472,7 +1496,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } if (!uprobe) - *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } -- cgit v1.2.3 From ab0d805c7b9089f9a9b291f33ab95301d6604868 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sun, 24 Mar 2013 18:24:37 +0100 Subject: uprobes: Turn copy_opcode() into copy_from_page() No functional changes. Rename copy_opcode() into copy_from_page() and add the new "int len" argument to make it more more generic for the new users. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- kernel/events/uprobes.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ca9012930ce7..850eb9eb179b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -187,10 +187,10 @@ bool __weak is_trap_insn(uprobe_opcode_t *insn) return is_swbp_insn(insn); } -static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) +static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len) { void *kaddr = kmap_atomic(page); - memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE); + memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len); kunmap_atomic(kaddr); } @@ -208,7 +208,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * is a trap variant; uprobes always wins over any other (gdb) * breakpoint. */ - copy_opcode(page, vaddr, &old_opcode); + copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE); is_swbp = is_swbp_insn(&old_opcode); if (is_swbp_insn(new_opcode)) { @@ -1472,7 +1472,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (result < 0) return result; - copy_opcode(page, vaddr, &opcode); + copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); put_page(page); out: /* This needs to return true for any variant of the trap insn */ -- cgit v1.2.3 From 2edb7b5574d447354202ba365fb8ca6a1bac1d1c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sun, 24 Mar 2013 18:37:48 +0100 Subject: uprobes: Change __copy_insn() to use copy_from_page() Change __copy_insn() to use copy_from_page() and simplify the code. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- kernel/events/uprobes.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 850eb9eb179b..ba6acfe7c0de 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -500,30 +500,21 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, unsigned long nbytes, loff_t offset) { struct page *page; - void *vaddr; - unsigned long off; - pgoff_t idx; if (!filp) return -EINVAL; if (!mapping->a_ops->readpage) return -EIO; - - idx = offset >> PAGE_CACHE_SHIFT; - off = offset & ~PAGE_MASK; - /* * Ensure that the page that has the original instruction is * populated and in page-cache. */ - page = read_mapping_page(mapping, idx, filp); + page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); if (IS_ERR(page)) return PTR_ERR(page); - vaddr = kmap_atomic(page); - memcpy(insn, vaddr + off, nbytes); - kunmap_atomic(vaddr); + copy_from_page(page, offset, insn, nbytes); page_cache_release(page); return 0; -- cgit v1.2.3 From 98763a1bb1515f8a8d7f1d9ae42604e19872364b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sun, 24 Mar 2013 18:45:44 +0100 Subject: uprobes: Kill the unnecesary filp != NULL check in __copy_insn() __copy_insn(filp) can only be called after valid_vma() returns T, vma->vm_file passed as "filp" can not be NULL. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- kernel/events/uprobes.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ba6acfe7c0de..093866547fe3 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -501,9 +501,6 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn, { struct page *page; - if (!filp) - return -EINVAL; - if (!mapping->a_ops->readpage) return -EIO; /* -- cgit v1.2.3 From 5669ccee21d87622f30a724b3fe0d04ec5b0afae Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sun, 24 Mar 2013 18:58:04 +0100 Subject: uprobes: Introduce copy_to_page() Extract the kmap_atomic/memcpy/kunmap_atomic code from xol_get_insn_slot() into the new simple helper, copy_to_page(). It will have more users soon. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- kernel/events/uprobes.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 093866547fe3..b8255eaca190 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -194,6 +194,13 @@ static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, in kunmap_atomic(kaddr); } +static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len) +{ + void *kaddr = kmap_atomic(page); + memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); + kunmap_atomic(kaddr); +} + static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *new_opcode) { uprobe_opcode_t old_opcode; @@ -1227,9 +1234,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area) static unsigned long xol_get_insn_slot(struct uprobe *uprobe) { struct xol_area *area; - unsigned long offset; unsigned long xol_vaddr; - void *vaddr; area = get_xol_area(); if (!area) @@ -1240,10 +1245,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) return 0; /* Initialize the slot */ - offset = xol_vaddr & ~PAGE_MASK; - vaddr = kmap_atomic(area->page); - memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); - kunmap_atomic(vaddr); + copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); /* * We probably need flush_icache_user_range() but it needs vma. * This should work on supported architectures too. -- cgit v1.2.3 From 3f47107c5c2972ca47f216889080f6ef818b25e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sun, 24 Mar 2013 19:04:36 +0100 Subject: uprobes: Change write_opcode() to use copy_*page() Change write_opcode() to use copy_highpage() + copy_to_page() and simplify the code. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- kernel/events/uprobes.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b8255eaca190..7312503caf2e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -255,7 +255,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t opcode) { struct page *old_page, *new_page; - void *vaddr_old, *vaddr_new; struct vm_area_struct *vma; int ret; @@ -276,15 +275,8 @@ retry: __SetPageUptodate(new_page); - /* copy the page now that we've got it stable */ - vaddr_old = kmap_atomic(old_page); - vaddr_new = kmap_atomic(new_page); - - memcpy(vaddr_new, vaddr_old, PAGE_SIZE); - memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE); - - kunmap_atomic(vaddr_new); - kunmap_atomic(vaddr_old); + copy_highpage(new_page, old_page); + copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = anon_vma_prepare(vma); if (ret) -- cgit v1.2.3 From 5c529597e922c26910fe49b8d5f93aeaca9a2415 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan <laijs@cn.fujitsu.com> Date: Thu, 4 Apr 2013 10:05:38 +0800 Subject: workqueue: avoid false negative WARN_ON() in destroy_workqueue() destroy_workqueue() performs several sanity checks before proceeding with destruction of a workqueue. One of the checks verifies that refcnt of each pwq (pool_workqueue) is over 1 as at that point there should be no in-flight work items and the only holder of pwq refs is the workqueue itself. This worked fine as a workqueue used to hold only one reference to its pwqs; however, since 4c16bd327c ("workqueue: implement NUMA affinity for unbound workqueues"), a workqueue may hold multiple references to its default pwq triggering this sanity check spuriously. Fix it by not triggering the pwq->refcnt assertion on default pwqs. An example spurious WARN trigger follows. WARNING: at kernel/workqueue.c:4201 destroy_workqueue+0x6a/0x13e() Hardware name: 4286C12 Modules linked in: sdhci_pci sdhci mmc_core usb_storage i915 drm_kms_helper drm i2c_algo_bit i2c_core video Pid: 361, comm: umount Not tainted 3.9.0-rc5+ #29 Call Trace: [<c04314a7>] warn_slowpath_common+0x7c/0x93 [<c04314e0>] warn_slowpath_null+0x22/0x24 [<c044796a>] destroy_workqueue+0x6a/0x13e [<c056dc01>] ext4_put_super+0x43/0x2c4 [<c04fb7b8>] generic_shutdown_super+0x4b/0xb9 [<c04fb848>] kill_block_super+0x22/0x60 [<c04fb960>] deactivate_locked_super+0x2f/0x56 [<c04fc41b>] deactivate_super+0x2e/0x31 [<c050f1e6>] mntput_no_expire+0x103/0x108 [<c050fdce>] sys_umount+0x2a2/0x2c4 [<c050fe0e>] sys_oldumount+0x1e/0x20 [<c085ba4d>] sysenter_do_call+0x12/0x38 tj: Rewrote description. Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> Reported-by: Fengguang Wu <fengguang.wu@intel.com> --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dd2a4c49a39a..c273376ff73e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4201,7 +4201,7 @@ void destroy_workqueue(struct workqueue_struct *wq) } } - if (WARN_ON(pwq->refcnt > 1) || + if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || WARN_ON(pwq->nr_active) || WARN_ON(!list_empty(&pwq->delayed_works))) { mutex_unlock(&wq->mutex); -- cgit v1.2.3 From 8cc9934520e7f752fe45d5199664d741ba24a932 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 7 Apr 2013 09:29:50 -0700 Subject: cgroup, cpuset: replace move_member_tasks_to_cpuset() with cgroup_transfer_tasks() When a cpuset becomes empty (no CPU or memory), its tasks are transferred with the nearest ancestor with execution resources. This is implemented using cgroup_scan_tasks() with a callback which grabs cgroup_mutex and invokes cgroup_attach_task() on each task. Both cgroup_mutex and cgroup_attach_task() are scheduled to be unexported. Implement cgroup_transfer_tasks() in cgroup proper which is essentially the same as move_member_tasks_to_cpuset() except that it takes cgroups instead of cpusets and @to comes before @from like normal functions with those arguments, and replace move_member_tasks_to_cpuset() with it. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 28 +++++++++++++++++++++++++++ kernel/cpuset.c | 51 ++++++-------------------------------------------- 3 files changed, 35 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 01c48c6806d6..f8eb01d75ddc 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -696,6 +696,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, bool threadgroup); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); /* * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4aee5bdd66c8..147d7ccb3b0b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3269,6 +3269,34 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } +static void cgroup_transfer_one_task(struct task_struct *task, + struct cgroup_scanner *scan) +{ + struct cgroup *new_cgroup = scan->data; + + cgroup_lock(); + cgroup_attach_task(new_cgroup, task, false); + cgroup_unlock(); +} + +/** + * cgroup_trasnsfer_tasks - move tasks from one cgroup to another + * @to: cgroup to which the tasks will be moved + * @from: cgroup in which the tasks currently reside + */ +int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) +{ + struct cgroup_scanner scan; + + scan.cg = from; + scan.test_task = NULL; /* select all tasks in cgroup */ + scan.process_task = cgroup_transfer_one_task; + scan.heap = NULL; + scan.data = to; + + return cgroup_scan_tasks(&scan); +} + /* * Stuff for reading the 'tasks'/'procs' files. * diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 98d458aad789..866d78ee7930 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1994,50 +1994,6 @@ int __init cpuset_init(void) return 0; } -/** - * cpuset_do_move_task - move a given task to another cpuset - * @tsk: pointer to task_struct the task to move - * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner - * - * Called by cgroup_scan_tasks() for each task in a cgroup. - * Return nonzero to stop the walk through the tasks. - */ -static void cpuset_do_move_task(struct task_struct *tsk, - struct cgroup_scanner *scan) -{ - struct cgroup *new_cgroup = scan->data; - - cgroup_lock(); - cgroup_attach_task(new_cgroup, tsk, false); - cgroup_unlock(); -} - -/** - * move_member_tasks_to_cpuset - move tasks from one cpuset to another - * @from: cpuset in which the tasks currently reside - * @to: cpuset to which the tasks will be moved - * - * Called with cpuset_mutex held - * callback_mutex must not be held, as cpuset_attach() will take it. - * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, - * calling callback functions for each. - */ -static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) -{ - struct cgroup_scanner scan; - - scan.cg = from->css.cgroup; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cpuset_do_move_task; - scan.heap = NULL; - scan.data = to->css.cgroup; - - if (cgroup_scan_tasks(&scan)) - printk(KERN_ERR "move_member_tasks_to_cpuset: " - "cgroup_scan_tasks failed\n"); -} - /* * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, @@ -2058,7 +2014,12 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) nodes_empty(parent->mems_allowed)) parent = parent_cs(parent); - move_member_tasks_to_cpuset(cs, parent); + if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) { + rcu_read_lock(); + printk(KERN_ERR "cpuset: failed to transfer tasks out of empty cpuset %s\n", + cgroup_name(cs->css.cgroup)); + rcu_read_unlock(); + } } /** -- cgit v1.2.3 From 7ae1bad99e27b8838d480a24edf4646a2fc547df Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 7 Apr 2013 09:29:51 -0700 Subject: cgroup: relocate cgroup_lock_live_group() and cgroup_attach_task_all() cgroup_lock_live_group() and cgroup_attach_task() are scheduled to be made static. Relocate the former and cgroup_attach_task_all() so that we don't need forward declarations. This patch is pure relocation. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> --- kernel/cgroup.c | 84 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 147d7ccb3b0b..ae7617095dec 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -329,6 +329,24 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return __d_cfe(dentry)->type; } +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the lock should be later released with + * cgroup_unlock(). On failure returns false with no lock held. + */ +bool cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(cgroup_lock_live_group); + /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ static LIST_HEAD(release_list); @@ -1943,30 +1961,6 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp, put_css_set(oldcg); } -/** - * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' - * @from: attach to all cgroups of a given task - * @tsk: the task to be attached - */ -int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) -{ - struct cgroupfs_root *root; - int retval = 0; - - cgroup_lock(); - for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); - - retval = cgroup_attach_task(from_cg, tsk, false); - if (retval) - break; - } - cgroup_unlock(); - - return retval; -} -EXPORT_SYMBOL_GPL(cgroup_attach_task_all); - /** * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to @@ -2204,6 +2198,30 @@ out_unlock_cgroup: return ret; } +/** + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task + * @tsk: the task to be attached + */ +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) +{ + struct cgroupfs_root *root; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk, false); + if (retval) + break; + } + cgroup_unlock(); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); + static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { return attach_task_by_pid(cgrp, pid, false); @@ -2214,24 +2232,6 @@ static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) return attach_task_by_pid(cgrp, tgid, true); } -/** - * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. - * @cgrp: the cgroup to be checked for liveness - * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. - */ -bool cgroup_lock_live_group(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_mutex); - if (cgroup_is_removed(cgrp)) { - mutex_unlock(&cgroup_mutex); - return false; - } - return true; -} -EXPORT_SYMBOL_GPL(cgroup_lock_live_group); - static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { -- cgit v1.2.3 From b9777cf8d7c7854c3c38bd6621d993b85c2afcdf Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 7 Apr 2013 09:29:51 -0700 Subject: cgroup: unexport locking interface and cgroup_attach_task() Now that all external cgroup_lock() users are gone, we can finally unexport the locking interface and prevent future abuse of cgroup_mutex. Make cgroup_[un]lock() and cgroup_lock_live_group() static. Also, cgroup_attach_task() doesn't have any user left and can't be used without locking interface anyway. Make it static too. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> --- include/linux/cgroup.h | 5 ----- kernel/cgroup.c | 9 +++------ 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index f8eb01d75ddc..63deb70f3149 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -30,10 +30,7 @@ struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); -extern void cgroup_lock(void); extern int cgroup_lock_is_held(void); -extern bool cgroup_lock_live_group(struct cgroup *cgrp); -extern void cgroup_unlock(void); extern void cgroup_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); @@ -693,8 +690,6 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, struct cgroup_iter *it); void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, - bool threadgroup); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ae7617095dec..32ca0304452f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -336,7 +336,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * On success, returns true; the lock should be later released with * cgroup_unlock(). On failure returns false with no lock held. */ -bool cgroup_lock_live_group(struct cgroup *cgrp) +static bool cgroup_lock_live_group(struct cgroup *cgrp) { mutex_lock(&cgroup_mutex); if (cgroup_is_removed(cgrp)) { @@ -345,7 +345,6 @@ bool cgroup_lock_live_group(struct cgroup *cgrp) } return true; } -EXPORT_SYMBOL_GPL(cgroup_lock_live_group); /* the list of cgroups eligible for automatic release. Protected by * release_list_lock */ @@ -824,22 +823,20 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * cgroup_lock - lock out any changes to cgroup structures * */ -void cgroup_lock(void) +static void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } -EXPORT_SYMBOL_GPL(cgroup_lock); /** * cgroup_unlock - release lock on cgroup changes * * Undo the lock taken in a previous cgroup_lock() call. */ -void cgroup_unlock(void) +static void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } -EXPORT_SYMBOL_GPL(cgroup_unlock); /* * A couple of forward declarations required, due to cyclic reference loop: -- cgit v1.2.3 From 47cfcd0922454e49f4923b1e2d31a5bf199237c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 7 Apr 2013 09:29:51 -0700 Subject: cgroup: kill cgroup_[un]lock() Now that locking interface is unexported, there's no reason to keep around these thin wrappers. Kill them and use mutex operations directly. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> --- kernel/cgroup.c | 41 +++++++++++------------------------------ 1 file changed, 11 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 32ca0304452f..1a65958c1a0b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -333,8 +333,8 @@ static inline struct cftype *__d_cft(struct dentry *dentry) * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness * - * On success, returns true; the lock should be later released with - * cgroup_unlock(). On failure returns false with no lock held. + * On success, returns true; the mutex should be later unlocked. On + * failure returns false with no lock held. */ static bool cgroup_lock_live_group(struct cgroup *cgrp) { @@ -819,25 +819,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * update of a tasks cgroup pointer by cgroup_attach_task() */ -/** - * cgroup_lock - lock out any changes to cgroup structures - * - */ -static void cgroup_lock(void) -{ - mutex_lock(&cgroup_mutex); -} - -/** - * cgroup_unlock - release lock on cgroup changes - * - * Undo the lock taken in a previous cgroup_lock() call. - */ -static void cgroup_unlock(void) -{ - mutex_unlock(&cgroup_mutex); -} - /* * A couple of forward declarations required, due to cyclic reference loop: * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> @@ -1967,8 +1948,8 @@ static void cgroup_task_migrate(struct cgroup *oldcgrp, * Call holding cgroup_mutex and the group_rwsem of the leader. Will take * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, - bool threadgroup) +static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; @@ -2191,7 +2172,7 @@ retry_find_task: put_task_struct(tsk); out_unlock_cgroup: - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return ret; } @@ -2205,7 +2186,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) struct cgroupfs_root *root; int retval = 0; - cgroup_lock(); + mutex_lock(&cgroup_mutex); for_each_active_root(root) { struct cgroup *from_cg = task_cgroup_from_root(from, root); @@ -2213,7 +2194,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return retval; } @@ -2240,7 +2221,7 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, mutex_lock(&cgroup_root_mutex); strcpy(cgrp->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return 0; } @@ -2251,7 +2232,7 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); seq_putc(seq, '\n'); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); return 0; } @@ -3271,9 +3252,9 @@ static void cgroup_transfer_one_task(struct task_struct *task, { struct cgroup *new_cgroup = scan->data; - cgroup_lock(); + mutex_lock(&cgroup_mutex); cgroup_attach_task(new_cgroup, task, false); - cgroup_unlock(); + mutex_unlock(&cgroup_mutex); } /** -- cgit v1.2.3 From 2219449a65ace0290cd9c2260ff337e326b8be8a Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 7 Apr 2013 09:29:51 -0700 Subject: cgroup: remove cgroup_lock_is_held() We don't want controllers to assume that the information is officially available and do funky things with it. The only user is task_subsys_state_check() which uses it to verify RCU access context. We can move cgroup_lock_is_held() inside CONFIG_PROVE_RCU but that doesn't add meaningful protection compared to conditionally exposing cgroup_mutex. Remove cgroup_lock_is_held(), export cgroup_mutex iff CONFIG_PROVE_RCU and use lockdep_is_held() directly on the mutex in task_subsys_state_check(). While at it, add parentheses around macro arguments in task_subsys_state_check(). Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Li Zefan <lizefan@huawei.com> --- include/linux/cgroup.h | 13 +++++++++---- kernel/cgroup.c | 20 ++++++-------------- 2 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 63deb70f3149..515927eebb37 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -30,7 +30,6 @@ struct css_id; extern int cgroup_init_early(void); extern int cgroup_init(void); -extern int cgroup_lock_is_held(void); extern void cgroup_fork(struct task_struct *p); extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); @@ -552,10 +551,16 @@ static inline struct cgroup_subsys_state *cgroup_subsys_state( * rcu_dereference_check() conditions, such as locks used during the * cgroup_subsys::attach() methods. */ +#ifdef CONFIG_PROVE_RCU +extern struct mutex cgroup_mutex; #define task_subsys_state_check(task, subsys_id, __c) \ - rcu_dereference_check(task->cgroups->subsys[subsys_id], \ - lockdep_is_held(&task->alloc_lock) || \ - cgroup_lock_is_held() || (__c)) + rcu_dereference_check((task)->cgroups->subsys[(subsys_id)], \ + lockdep_is_held(&(task)->alloc_lock) || \ + lockdep_is_held(&cgroup_mutex) || (__c)) +#else +#define task_subsys_state_check(task, subsys_id, __c) \ + rcu_dereference((task)->cgroups->subsys[(subsys_id)]) +#endif static inline struct cgroup_subsys_state * task_subsys_state(struct task_struct *task, int subsys_id) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1a65958c1a0b..ba3e24a76dae 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -83,7 +83,13 @@ * B happens only through cgroup_show_options() and using cgroup_root_mutex * breaks it. */ +#ifdef CONFIG_PROVE_RCU +DEFINE_MUTEX(cgroup_mutex); +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +#else static DEFINE_MUTEX(cgroup_mutex); +#endif + static DEFINE_MUTEX(cgroup_root_mutex); /* @@ -251,20 +257,6 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add); -#ifdef CONFIG_PROVE_LOCKING -int cgroup_lock_is_held(void) -{ - return lockdep_is_held(&cgroup_mutex); -} -#else /* #ifdef CONFIG_PROVE_LOCKING */ -int cgroup_lock_is_held(void) -{ - return mutex_is_locked(&cgroup_mutex); -} -#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ - -EXPORT_SYMBOL_GPL(cgroup_lock_is_held); - static int css_unbias_refcnt(int refcnt) { return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; -- cgit v1.2.3 From a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Sat, 6 Apr 2013 10:10:27 +0200 Subject: sched_clock: Prevent 64bit inatomicity on 32bit systems The sched_clock_remote() implementation has the following inatomicity problem on 32bit systems when accessing the remote scd->clock, which is a 64bit value. CPU0 CPU1 sched_clock_local() sched_clock_remote(CPU0) ... remote_clock = scd[CPU0]->clock read_low32bit(scd[CPU0]->clock) cmpxchg64(scd->clock,...) read_high32bit(scd[CPU0]->clock) While the update of scd->clock is using an atomic64 mechanism, the readout on the remote cpu is not, which can cause completely bogus readouts. It is a quite rare problem, because it requires the update to hit the narrow race window between the low/high readout and the update must go across the 32bit boundary. The resulting misbehaviour is, that CPU1 will see the sched_clock on CPU1 ~4 seconds ahead of it's own and update CPU1s sched_clock value to this bogus timestamp. This stays that way due to the clamping implementation for about 4 seconds until the synchronization with CLOCK_MONOTONIC undoes the problem. The issue is hard to observe, because it might only result in a less accurate SCHED_OTHER timeslicing behaviour. To create observable damage on realtime scheduling classes, it is necessary that the bogus update of CPU1 sched_clock happens in the context of an realtime thread, which then gets charged 4 seconds of RT runtime, which results in the RT throttler mechanism to trigger and prevent scheduling of RT tasks for a little less than 4 seconds. So this is quite unlikely as well. The issue was quite hard to decode as the reproduction time is between 2 days and 3 weeks and intrusive tracing makes it less likely, but the following trace recorded with trace_clock=global, which uses sched_clock_local(), gave the final hint: <idle>-0 0d..30 400269.477150: hrtimer_cancel: hrtimer=0xf7061e80 <idle>-0 0d..30 400269.477151: hrtimer_start: hrtimer=0xf7061e80 ... irq/20-S-587 1d..32 400273.772118: sched_wakeup: comm= ... target_cpu=0 <idle>-0 0dN.30 400273.772118: hrtimer_cancel: hrtimer=0xf7061e80 What happens is that CPU0 goes idle and invokes sched_clock_idle_sleep_event() which invokes sched_clock_local() and CPU1 runs a remote wakeup for CPU0 at the same time, which invokes sched_remote_clock(). The time jump gets propagated to CPU0 via sched_remote_clock() and stays stale on both cores for ~4 seconds. There are only two other possibilities, which could cause a stale sched clock: 1) ktime_get() which reads out CLOCK_MONOTONIC returns a sporadic wrong value. 2) sched_clock() which reads the TSC returns a sporadic wrong value. #1 can be excluded because sched_clock would continue to increase for one jiffy and then go stale. #2 can be excluded because it would not make the clock jump forward. It would just result in a stale sched_clock for one jiffy. After quite some brain twisting and finding the same pattern on other traces, sched_clock_remote() remained the only place which could cause such a problem and as explained above it's indeed racy on 32bit systems. So while on 64bit systems the readout is atomic, we need to verify the remote readout on 32bit machines. We need to protect the local->clock readout in sched_clock_remote() on 32bit as well because an NMI could hit between the low and the high readout, call sched_clock_local() and modify local->clock. Thanks to Siegfried Wulsch for bearing with my debug requests and going through the tedious tasks of running a bunch of reproducer systems to generate the debug information which let me decode the issue. Reported-by: Siegfried Wulsch <Siegfried.Wulsch@rovema.de> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Steven Rostedt <rostedt@goodmis.org> Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304051544160.21884@ionos Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: stable@vger.kernel.org --- kernel/sched/clock.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492df..c3ae1446461c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks -- cgit v1.2.3 From fd9b86d37a600488dbd80fe60cca46b822bff1cd Mon Sep 17 00:00:00 2001 From: libin <huawei.libin@huawei.com> Date: Mon, 8 Apr 2013 14:39:12 +0800 Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on sysctl") was an incomplete bug fix. This patch fixes sd->*_idx limit range to [0 ~ CPU_LOAD_IDX_MAX-1] avoiding array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX on sysctl. Signed-off-by: Libin <huawei.libin@huawei.com> Cc: <jiang.liu@huawei.com> Cc: <guohanjun@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 306943f531a3..fa077929e315 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4933,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, -- cgit v1.2.3 From c97847d2f0eb77c806e650e04d9bbcf79fa05730 Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Mon, 8 Apr 2013 11:48:27 +0800 Subject: perf: Fix strncpy() use, always make sure it's NUL terminated For NUL terminated string, always make sure that there's '\0' at the end. In our case we need a return value, so still use strncpy() and fix up the tail explicitly. (strlcpy() returns the size, not the pointer) Signed-off-by: Chen Gang <gang.chen@asianux.com> Cc: a.p.zijlstra@chello.nl <a.p.zijlstra@chello.nl> Cc: paulus@samba.org <paulus@samba.org> Cc: acme@ghostprotocols.net <acme@ghostprotocols.net> Link: http://lkml.kernel.org/r/51623E0B.7070101@asianux.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 59412d037eed..7f0d67ea3f48 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4737,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); + sizeof(tmp) - 1); + tmp[sizeof(tmp) - 1] = '\0'; goto got_name; } -- cgit v1.2.3 From 67012ab1d2ce871afea4ee55408f233f97d09d07 Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Mon, 8 Apr 2013 12:06:44 +0800 Subject: perf: Fix strncpy() use, use strlcpy() instead of strncpy() For NUL terminated string we always need to set '\0' at the end. Signed-off-by: Chen Gang <gang.chen@asianux.com> Cc: rostedt@goodmis.org Cc: Frederic Weisbecker <fweisbec@gmail.com> Link: http://lkml.kernel.org/r/51624254.30301@asianux.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade56981..3f5046a4d81b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -132,7 +132,7 @@ static char *default_bootup_tracer; static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = 1; @@ -162,7 +162,7 @@ static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { - strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); trace_boot_options = trace_boot_options_buf; return 0; } -- cgit v1.2.3 From 75761cc15877c155b3849b4e0e0cb3f897faf471 Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Mon, 8 Apr 2013 12:12:39 +0800 Subject: ftrace: Fix strncpy() use, use strlcpy() instead of strncpy() For NUL terminated string we always need to set '\0' at the end. Signed-off-by: Chen Gang <gang.chen@asianux.com> Cc: rostedt@goodmis.org Cc: Frederic Weisbecker <fweisbec@gmail.com> Link: http://lkml.kernel.org/r/516243B7.9020405@asianux.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/trace/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6893d5a2bf08..db143742704b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3441,14 +3441,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { - strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { - strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); -- cgit v1.2.3 From 28b4a521f618d9722bc780ea38b44718ce0fe283 Mon Sep 17 00:00:00 2001 From: Viresh Kumar <viresh.kumar@linaro.org> Date: Fri, 5 Apr 2013 16:26:46 +0530 Subject: sched: Fix typo inside comment Fix typo: sched_domains_nume_distance -> sched_domains_numa_distance Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org> Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: robin.randhawa@arm.com Cc: Steve.Bannister@arm.com Cc: Liviu.Dudau@arm.com Cc: charles.garcia-tobin@arm.com Cc: arvind.chauhan@arm.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/cd8084746ac932106d6fa6be388b8f2d6aa9617c.1365159023.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 849deb96e61e..f5e1aa5b2684 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6252,7 +6252,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ -- cgit v1.2.3 From ee761f629d598579594d7e1eb8c552f3c5f71e4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Thu, 21 Mar 2013 22:49:32 +0100 Subject: arch: Consolidate tsk_is_polling() Move it to a common place. Preparatory patch for implementing set/clear for the idle need_resched poll implementation. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Cc: Magnus Damm <magnus.damm@gmail.com> Link: http://lkml.kernel.org/r/20130321215233.446034505@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/alpha/include/asm/thread_info.h | 2 -- arch/ia64/include/asm/thread_info.h | 2 -- arch/metag/include/asm/thread_info.h | 2 -- arch/microblaze/include/asm/thread_info.h | 1 - arch/mn10300/include/asm/thread_info.h | 2 -- arch/openrisc/include/asm/thread_info.h | 2 -- arch/parisc/include/asm/thread_info.h | 2 -- arch/powerpc/include/asm/thread_info.h | 2 -- arch/sh/include/asm/thread_info.h | 2 -- arch/sparc/include/asm/thread_info_32.h | 2 -- arch/sparc/include/asm/thread_info_64.h | 2 -- arch/tile/include/asm/thread_info.h | 2 -- arch/x86/include/asm/thread_info.h | 2 -- include/linux/sched.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 5 ----- 15 files changed, 20 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 1f8c72959fb6..52cd2a4a3ff4 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -95,8 +95,6 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TS_POLLING 0x0010 /* idle task polling need_resched, skip sending interrupt */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 020d655ed082..cade13dd0299 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -131,8 +131,6 @@ struct thread_info { #define TS_POLLING 1 /* true if in idle loop and not sleeping */ #define TS_RESTORE_SIGMASK 2 /* restore signal mask in do_signal() */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 0ecd34d8b5f6..7c4a33006142 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -150,6 +150,4 @@ static inline int kstack_end(void *addr) #define _TIF_WORK_MASK (_TIF_ALLWORK_MASK & ~(_TIF_SYSCALL_TRACE | \ _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 008f30433d22..de26ea6373de 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -182,7 +182,6 @@ static inline bool test_and_clear_restore_sigmask(void) ti->status &= ~TS_RESTORE_SIGMASK; return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif #endif /* __KERNEL__ */ diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index f90062b0622d..224b4262486d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -165,8 +165,6 @@ void arch_release_thread_info(struct thread_info *ti); #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 07f3212422ad..d797acc901e4 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -128,8 +128,6 @@ register struct thread_info *current_thread_info_reg asm("r10"); /* For OpenRISC, this is anything in the LSW other than syscall trace */ #define _TIF_WORK_MASK (0xff & ~(_TIF_SYSCALL_TRACE|_TIF_SINGLESTEP)) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index d1fb79a36f3d..6182832e5b6c 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -77,8 +77,6 @@ struct thread_info { #define _TIF_SYSCALL_TRACE_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ _TIF_BLOCKSTEP) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_PARISC_THREAD_INFO_H */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 406b7b9a1341..8ceea14d6fe4 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -182,8 +182,6 @@ static inline bool test_thread_local_flags(unsigned int flags) #define is_32bit_task() (1) #endif -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index 7d5ac4e48485..45a93669289d 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -207,8 +207,6 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 25849ae3e900..dd3807599bb9 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -132,8 +132,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \ _TIF_SIGPENDING) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 269bd92313df..d5e504251079 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -256,8 +256,6 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #define thread32_stack_is_64bit(__SP) (((__SP) & 0x1) != 0) #define test_thread_64bit_stack(__SP) \ ((test_thread_flag(TIF_32BIT) && !thread32_stack_is_64bit(__SP)) ? \ diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index e9c670d7a7fe..ccc8ef37235c 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -153,8 +153,6 @@ extern void _cpu_idle(void); #define TS_POLLING 0x0004 /* in idle loop but not sleeping */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2cd056e3ada3..a1df6e84691f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -241,8 +241,6 @@ static inline struct thread_info *current_thread_info(void) skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..6709a5813f27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2621,6 +2621,26 @@ static inline int spin_needbreak(spinlock_t *lock) #endif } +/* + * Idle thread specific functions to determine the need_resched + * polling state. We have two versions, one based on TS_POLLING in + * thread_info.status and one based on TIF_POLLING_NRFLAG in + * thread_info.flags + */ +#ifdef TS_POLLING +static inline int tsk_is_polling(struct task_struct *p) +{ + return task_thread_info(p)->status & TS_POLLING; +} +#elif defined(TIF_POLLING_NRFLAG) +static inline int tsk_is_polling(struct task_struct *p) +{ + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +} +#else +static inline int tsk_is_polling(struct task_struct *p) { return 0; } +#endif + /* * Thread group CPU time accounting. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..243a20c5cf91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. */ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; -- cgit v1.2.3 From a1a04ec3c7c27a682473fd9beb2c996316a64649 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Thu, 21 Mar 2013 22:49:34 +0100 Subject: idle: Provide a generic entry point for the idle code For now this calls cpu_idle(), but in the long run we want to move the cpu bringup code to the core and therefor we add a state argument. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Cc: Magnus Damm <magnus.damm@gmail.com> Link: http://lkml.kernel.org/r/20130321215233.583190032@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- include/linux/cpu.h | 8 ++++++++ init/main.c | 2 +- kernel/Makefile | 1 + kernel/cpu/Makefile | 1 + kernel/cpu/idle.c | 10 ++++++++++ 5 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 kernel/cpu/Makefile create mode 100644 kernel/cpu/idle.c (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ce7a074f2519..7419e30c55fb 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -212,4 +212,12 @@ static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} #endif /* !CONFIG_PM_SLEEP_SMP */ +enum cpuhp_state { + CPUHP_OFFLINE, + CPUHP_ONLINE, +}; + +void cpu_startup_entry(enum cpuhp_state state); +void cpu_idle(void); + #endif /* _LINUX_CPU_H_ */ diff --git a/init/main.c b/init/main.c index 63534a141b4e..adb179d3e0f8 100644 --- a/init/main.c +++ b/init/main.c @@ -384,7 +384,7 @@ static noinline void __init_refok rest_init(void) init_idle_bootup_task(current); schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ - cpu_idle(); + cpu_startup_entry(CPUHP_ONLINE); } /* Check for early params. */ diff --git a/kernel/Makefile b/kernel/Makefile index bbde5f1a4486..d1574d47cf27 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -24,6 +24,7 @@ endif obj-y += sched/ obj-y += power/ +obj-y += cpu/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile new file mode 100644 index 000000000000..59ab052ef7a0 --- /dev/null +++ b/kernel/cpu/Makefile @@ -0,0 +1 @@ +obj-y = idle.o diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c new file mode 100644 index 000000000000..1908f00e0e98 --- /dev/null +++ b/kernel/cpu/idle.c @@ -0,0 +1,10 @@ +/* + * Generic entry point for the idle threads + */ +#include <linux/sched.h> +#include <linux/cpu.h> + +void cpu_startup_entry(enum cpuhp_state state) +{ + cpu_idle(); +} -- cgit v1.2.3 From d166991234347215dc23fc9dc15a63a83a1a54e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Thu, 21 Mar 2013 22:49:35 +0100 Subject: idle: Implement generic idle function All idle functions in arch/* are more or less the same, plus minus a few bugs and extra instrumentation, tickless support and other optional items. Implement a generic idle function which resembles the functionality found in arch/. Provide weak arch_cpu_idle_* functions which can be overridden by the architecture code if needed. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: Paul McKenney <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Reviewed-by: Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Cc: Magnus Damm <magnus.damm@gmail.com> Link: http://lkml.kernel.org/r/20130321215233.646635455@linutronix.de Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/Kconfig | 3 ++ include/linux/cpu.h | 8 ++++ kernel/cpu/idle.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 1455579791ec..a699f3767be4 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -216,6 +216,9 @@ config USE_GENERIC_SMP_HELPERS config GENERIC_SMP_IDLE_THREAD bool +config GENERIC_IDLE_LOOP + bool + # Select if arch init_task initializer is different to init/init_task.c config ARCH_INIT_TASK bool diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 7419e30c55fb..c6f6e0839b61 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -220,4 +220,12 @@ enum cpuhp_state { void cpu_startup_entry(enum cpuhp_state state); void cpu_idle(void); +void cpu_idle_poll_ctrl(bool enable); + +void arch_cpu_idle(void); +void arch_cpu_idle_prepare(void); +void arch_cpu_idle_enter(void); +void arch_cpu_idle_exit(void); +void arch_cpu_idle_dead(void); + #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 1908f00e0e98..54c320383934 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -3,8 +3,113 @@ */ #include <linux/sched.h> #include <linux/cpu.h> +#include <linux/tick.h> +#include <linux/mm.h> +#include <asm/tlb.h> + +#include <trace/events/power.h> + +#ifndef CONFIG_GENERIC_IDLE_LOOP void cpu_startup_entry(enum cpuhp_state state) { cpu_idle(); } +#else + +static int __read_mostly cpu_idle_force_poll; + +void cpu_idle_poll_ctrl(bool enable) +{ + if (enable) { + cpu_idle_force_poll++; + } else { + cpu_idle_force_poll--; + WARN_ON_ONCE(cpu_idle_force_poll < 0); + } +} + +#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP +static int __init cpu_idle_poll_setup(char *__unused) +{ + cpu_idle_force_poll = 1; + return 1; +} +__setup("nohlt", cpu_idle_poll_setup); + +static int __init cpu_idle_nopoll_setup(char *__unused) +{ + cpu_idle_force_poll = 0; + return 1; +} +__setup("hlt", cpu_idle_nopoll_setup); +#endif + +static inline int cpu_idle_poll(void) +{ + trace_cpu_idle_rcuidle(0, smp_processor_id()); + local_irq_enable(); + while (!need_resched()) + cpu_relax(); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); + return 1; +} + +/* Weak implementations for optional arch specific functions */ +void __weak arch_cpu_idle_prepare(void) { } +void __weak arch_cpu_idle_enter(void) { } +void __weak arch_cpu_idle_exit(void) { } +void __weak arch_cpu_idle_dead(void) { } +void __weak arch_cpu_idle(void) +{ + cpu_idle_force_poll = 1; +} + +/* + * Generic idle loop implementation + */ +static void cpu_idle_loop(void) +{ + while (1) { + tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); + rmb(); + + if (cpu_is_offline(smp_processor_id())) + arch_cpu_idle_dead(); + + local_irq_disable(); + arch_cpu_idle_enter(); + + if (cpu_idle_force_poll) { + cpu_idle_poll(); + } else { + current_clr_polling(); + if (!need_resched()) { + stop_critical_timings(); + rcu_idle_enter(); + arch_cpu_idle(); + WARN_ON_ONCE(irqs_disabled()); + rcu_idle_exit(); + start_critical_timings(); + } else { + local_irq_enable(); + } + current_set_polling(); + } + arch_cpu_idle_exit(); + } + tick_nohz_idle_exit(); + schedule_preempt_disabled(); + } +} + +void cpu_startup_entry(enum cpuhp_state state) +{ + current_set_polling(); + arch_cpu_idle_prepare(); + cpu_idle_loop(); +} +#endif -- cgit v1.2.3 From bfaf4af8abf92b883c04d137a0c18245cc9d51a2 Mon Sep 17 00:00:00 2001 From: Hong Zhiguo <honkiko@gmail.com> Date: Thu, 4 Apr 2013 15:01:21 +0800 Subject: lockdep: Remove unnecessary 'hlock_next' variable Signed-off-by: Hong Zhiguo <honkiko@gmail.com> Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1365058881-4044-1-git-send-email-honkiko@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/lockdep.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8a0efac4f99d..e5deddadeab1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2000,7 +2000,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr, *hlock_next; + struct held_lock *hlock_curr; int i, j; /* @@ -2057,12 +2057,10 @@ cache_hit: chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; /* Find the first held_lock of current chain */ - hlock_next = hlock; for (i = curr->lockdep_depth - 1; i >= 0; i--) { hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock_next->irq_context) + if (hlock_curr->irq_context != hlock->irq_context) break; - hlock_next = hlock; } i++; chain->depth = curr->lockdep_depth + 1 - i; -- cgit v1.2.3 From e614b3332a4f3f264a26da28e5a1f4cc3aea3974 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka <sgruszka@redhat.com> Date: Thu, 4 Apr 2013 10:57:48 +0200 Subject: sched/cputime: Fix accounting on multi-threaded processes Recent commit 6fac4829 ("cputime: Use accessors to read task cputime stats") introduced a bug, where we account many times the cputime of the first thread, instead of cputimes of all the different threads. Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/20130404085740.GA2495@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f4..e93cca92f38b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - task_cputime(tsk, &utime, &stime); + task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); -- cgit v1.2.3 From 2930e04d00e113ae24bb2b7c2b58de7b648a62c7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Tue, 26 Mar 2013 17:33:00 -0400 Subject: tracing: Fix race with update_max_tr_single and changing tracers The commit 34600f0e9 "tracing: Fix race with max_tr and changing tracers" fixed the updating of the main buffers with the race of changing tracers, but left out the fix to the updating of just a per cpu buffer. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade56981..7ba7fc76f9eb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -744,8 +744,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) + if (!current_trace->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(current_trace != &nop_trace); return; + } arch_spin_lock(&ftrace_max_lock); -- cgit v1.2.3 From 5000c418840b309251c5887f0b56503aae30f84c Mon Sep 17 00:00:00 2001 From: Jan Kiszka <jan.kiszka@siemens.com> Date: Tue, 26 Mar 2013 17:53:03 +0100 Subject: ftrace: Consistently restore trace function on sysctl enabling If we reenable ftrace via syctl, we currently set ftrace_trace_function based on the previous simplistic algorithm. This is inconsistent with what update_ftrace_function does. So better call that helper instead. Link: http://lkml.kernel.org/r/5151D26F.1070702@siemens.com Cc: stable@vger.kernel.org Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6893d5a2bf08..cc4943c7ce6d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4555,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) { - if (ftrace_ops_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_ops_list->func; - else - ftrace_trace_function = ftrace_ops_list_func; - } + if (ftrace_ops_list != &ftrace_list_end) + update_ftrace_function(); } else { /* stopping ftrace calls (just send to ftrace_stub) */ -- cgit v1.2.3 From 395b97a3aeff0b8d949ee3e67bf8c11c5ffd6861 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Wed, 27 Mar 2013 09:31:28 -0400 Subject: ftrace: Do not call stub functions in control loop The function tracing control loop used by perf spits out a warning if the called function is not a control function. This is because the control function references a per cpu allocated data structure on struct ftrace_ops that is not allocated for other types of functions. commit 0a016409e42 "ftrace: Optimize the function tracer list loop" Had an optimization done to all function tracing loops to optimize for a single registered ops. Unfortunately, this allows for a slight race when tracing starts or ends, where the stub function might be called after the current registered ops is removed. In this case we get the following dump: root# perf stat -e ftrace:function sleep 1 [ 74.339105] WARNING: at include/linux/ftrace.h:209 ftrace_ops_control_func+0xde/0xf0() [ 74.349522] Hardware name: PRIMERGY RX200 S6 [ 74.357149] Modules linked in: sg igb iTCO_wdt ptp pps_core iTCO_vendor_support i7core_edac dca lpc_ich i2c_i801 coretemp edac_core crc32c_intel mfd_core ghash_clmulni_intel dm_multipath acpi_power_meter pcspk r microcode vhost_net tun macvtap macvlan nfsd kvm_intel kvm auth_rpcgss nfs_acl lockd sunrpc uinput xfs libcrc32c sd_mod crc_t10dif sr_mod cdrom mgag200 i2c_algo_bit drm_kms_helper ttm qla2xxx mptsas ahci drm li bahci scsi_transport_sas mptscsih libata scsi_transport_fc i2c_core mptbase scsi_tgt dm_mirror dm_region_hash dm_log dm_mod [ 74.446233] Pid: 1377, comm: perf Tainted: G W 3.9.0-rc1 #1 [ 74.453458] Call Trace: [ 74.456233] [<ffffffff81062e3f>] warn_slowpath_common+0x7f/0xc0 [ 74.462997] [<ffffffff810fbc60>] ? rcu_note_context_switch+0xa0/0xa0 [ 74.470272] [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0 [ 74.478117] [<ffffffff81062e9a>] warn_slowpath_null+0x1a/0x20 [ 74.484681] [<ffffffff81102ede>] ftrace_ops_control_func+0xde/0xf0 [ 74.491760] [<ffffffff8162f400>] ftrace_call+0x5/0x2f [ 74.497511] [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f [ 74.503486] [<ffffffff8162f400>] ? ftrace_call+0x5/0x2f [ 74.509500] [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50 [ 74.516088] [<ffffffff816254d5>] ? _cond_resched+0x5/0x40 [ 74.522268] [<ffffffff810fbc65>] ? synchronize_sched+0x5/0x50 [ 74.528837] [<ffffffff811041a2>] ? __unregister_ftrace_function+0xa2/0x1a0 [ 74.536696] [<ffffffff816254d5>] ? _cond_resched+0x5/0x40 [ 74.542878] [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50 [ 74.548869] [<ffffffff81105c67>] unregister_ftrace_function+0x27/0x50 [ 74.556243] [<ffffffff8111eadf>] perf_ftrace_event_register+0x9f/0x140 [ 74.563709] [<ffffffff816254d5>] ? _cond_resched+0x5/0x40 [ 74.569887] [<ffffffff8162402d>] ? mutex_lock+0x1d/0x50 [ 74.575898] [<ffffffff8111e94e>] perf_trace_destroy+0x2e/0x50 [ 74.582505] [<ffffffff81127ba9>] tp_perf_event_destroy+0x9/0x10 [ 74.589298] [<ffffffff811295d0>] free_event+0x70/0x1a0 [ 74.595208] [<ffffffff8112a579>] perf_event_release_kernel+0x69/0xa0 [ 74.602460] [<ffffffff816254d5>] ? _cond_resched+0x5/0x40 [ 74.608667] [<ffffffff8112a640>] put_event+0x90/0xc0 [ 74.614373] [<ffffffff8112a740>] perf_release+0x10/0x20 [ 74.620367] [<ffffffff811a3044>] __fput+0xf4/0x280 [ 74.625894] [<ffffffff811a31de>] ____fput+0xe/0x10 [ 74.631387] [<ffffffff81083697>] task_work_run+0xa7/0xe0 [ 74.637452] [<ffffffff81014981>] do_notify_resume+0x71/0xb0 [ 74.643843] [<ffffffff8162fa92>] int_signal+0x12/0x17 To fix this a new ftrace_ops flag is added that denotes the ftrace_list_end ftrace_ops stub as just that, a stub. This flag is now checked in the control loop and the function is not called if the flag is set. Thanks to Jovi for not just reporting the bug, but also pointing out where the bug was in the code. Link: http://lkml.kernel.org/r/514A8855.7090402@redhat.com Link: http://lkml.kernel.org/r/1364377499-1900-15-git-send-email-jovi.zhangwei@huawei.com Tested-by: WANG Chao <chaowang@redhat.com> Reported-by: WANG Chao <chaowang@redhat.com> Reported-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace.h | 2 ++ kernel/trace/ftrace.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e5ca8ef50e9b..167abf907802 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -89,6 +89,7 @@ typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, * that the call back has its own recursion protection. If it does * not set this, then the ftrace infrastructure will add recursion * protection for the caller. + * STUB - The ftrace_ops is just a place holder. */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, @@ -98,6 +99,7 @@ enum { FTRACE_OPS_FL_SAVE_REGS = 1 << 4, FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = 1 << 5, FTRACE_OPS_FL_RECURSION_SAFE = 1 << 6, + FTRACE_OPS_FL_STUB = 1 << 7, }; struct ftrace_ops { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index cc4943c7ce6d..7e897106b7e0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,7 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -4131,7 +4131,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); do_for_each_ftrace_op(op, ftrace_control_list) { - if (!ftrace_function_local_disabled(op) && + if (!(op->flags & FTRACE_OPS_FL_STUB) && + !ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); -- cgit v1.2.3 From 6f389a8f1dd22a24f3d9afc2812b30d639e94625 Mon Sep 17 00:00:00 2001 From: Huacai Chen <chenhc@lemote.com> Date: Sun, 7 Apr 2013 02:14:14 +0000 Subject: PM / reboot: call syscore_shutdown() after disable_nonboot_cpus() As commit 40dc166c (PM / Core: Introduce struct syscore_ops for core subsystems PM) say, syscore_ops operations should be carried with one CPU on-line and interrupts disabled. However, after commit f96972f2d (kernel/sys.c: call disable_nonboot_cpus() in kernel_restart()), syscore_shutdown() is called before disable_nonboot_cpus(), so break the rules. We have a MIPS machine with a 8259A PIC, and there is an external timer (HPET) linked at 8259A. Since 8259A has been shutdown too early (by syscore_shutdown()), disable_nonboot_cpus() runs without timer interrupt, so it hangs and reboot fails. This patch call syscore_shutdown() a little later (after disable_nonboot_cpus()) to avoid reboot failure, this is the same way as poweroff does. For consistency, add disable_nonboot_cpus() to kernel_halt(). Signed-off-by: Huacai Chen <chenhc@lemote.com> Cc: <stable@vger.kernel.org> Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com> --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 39c9c4a2949f..0da73cf73e60 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd) system_state = SYSTEM_RESTART; usermodehelper_disable(); device_shutdown(); - syscore_shutdown(); } /** @@ -370,6 +369,7 @@ void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); disable_nonboot_cpus(); + syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else @@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); + disable_nonboot_cpus(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); -- cgit v1.2.3 From 9607a869ee59594f3f7b9f3ac43a11d92bf3f960 Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Mon, 8 Apr 2013 12:06:44 +0800 Subject: kernel: tracing: Use strlcpy instead of strncpy Use strlcpy() instead of strncpy() as it will always add a '\0' to the end of the string even if the buffer is smaller than what is being copied. Link: http://lkml.kernel.org/r/51624254.30301@asianux.com Signed-off-by: Chen Gang <gang.chen@asianux.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 25770824598f..548a1f7ea2c1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3496,14 +3496,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { - strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { - strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 829b2bee24e8..07860b995752 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -125,7 +125,7 @@ static bool allocate_snapshot; static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ ring_buffer_expanded = true; @@ -164,7 +164,7 @@ static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { - strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); trace_boot_options = trace_boot_options_buf; return 0; } -- cgit v1.2.3 From cece95dfe5aa56ba99e51b4746230ff0b8542abd Mon Sep 17 00:00:00 2001 From: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Date: Tue, 9 Apr 2013 14:29:11 +0800 Subject: workqueue: use kmem_cache_free() instead of kfree() memory allocated by kmem_cache_alloc() should be freed using kmem_cache_free(), not kfree(). Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c273376ff73e..154aa12af48e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3750,7 +3750,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) if (pwq) { put_unbound_pool(pwq->pool); - kfree(pwq); + kmem_cache_free(pwq_cache, pwq); } } -- cgit v1.2.3 From 83e03b3fe4daffdebbb42151d5410d730ae50bd1 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Mon, 1 Apr 2013 21:46:23 +0900 Subject: tracing: Fix double free when function profile init failed On the failure path, stat->start and stat->pages will refer same page. So it'll attempt to free the same page again and get kernel panic. Link: http://lkml.kernel.org/r/1364820385-32027-1-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7e897106b7e0..926ebfb74936 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) free_page(tmp); } - free_page((unsigned long)stat->pages); stat->pages = NULL; stat->start = NULL; -- cgit v1.2.3 From 39e30cd1537937d3c00ef87e865324e981434e5b Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Mon, 1 Apr 2013 21:46:24 +0900 Subject: tracing: Fix off-by-one on allocating stat->pages The first page was allocated separately, so no need to start from 0. Link: http://lkml.kernel.org/r/1364820385-32027-2-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 548a1f7ea2c1..c9f31491009f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); - for (i = 0; i < pages; i++) { + for (i = 1; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); if (!pg->next) goto out_free; -- cgit v1.2.3 From 4e2dcb73aecbde9fe4e3137c9ea35cb6aa6cb286 Mon Sep 17 00:00:00 2001 From: Zhang Hang <bob.zhanghang@huawei.com> Date: Wed, 10 Apr 2013 14:04:55 +0800 Subject: sched: Simplify can_migrate_task() At this point tsk_cache_hot is always true, so no need to check it. Signed-off-by: Zhang Hang <bob.zhanghang@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/51650107.9040606@huawei.com [ Also remove unnecessary schedstat #ifdefs. ] Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 539760ef00c4..bf8ab4f5e603 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3921,20 +3921,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif + return 1; } - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; - } - return 1; + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; } /* -- cgit v1.2.3 From b9b0853a4b377f84a5e6ed091816a9a2d6b10918 Mon Sep 17 00:00:00 2001 From: Libin <huawei.libin@huawei.com> Date: Mon, 1 Apr 2013 19:14:01 +0800 Subject: sched: Fix comment in rebalance_domains() A comment in function rebalance_domains() mentions arch_init_sched_domains(), but that function does not exist anymore. The proper function is init_sched_domains(). Signed-off-by: Libin <huawei.libin@huawei.com> Cc: <peterz@infradead.org> Link: http://lkml.kernel.org/r/1364814841-49156-1-git-send-email-huawei.libin@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf8ab4f5e603..155783b4e4bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5466,7 +5466,7 @@ void update_max_interval(void) * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains. */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { -- cgit v1.2.3 From 2e76c24d72372db35f226a49c2b99d0fd8cfd400 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:36:31 +0800 Subject: sched: Split cpuacct code out of core.c Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/5155366F.5060404@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/Makefile | 1 + kernel/sched/core.c | 220 ----------------------------------------------- kernel/sched/cpuacct.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 220 deletions(-) create mode 100644 kernel/sched/cpuacct.c (limited to 'kernel') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5e1aa5b2684..c28222f72c80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8043,226 +8043,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..50ec24b6193d --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,227 @@ +#include <linux/cgroup.h> +#include <linux/slab.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/cpumask.h> +#include <linux/seq_file.h> +#include <linux/rcupdate.h> +#include <linux/kernel_stat.h> + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +struct cpuacct root_cpuacct; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + if (unlikely(!cpuacct_subsys.active)) + return; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + for (; ca; ca = parent_ca(ca)) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + } + + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, +}; -- cgit v1.2.3 From 60fed7891d4115be0ed7ff9ce6851eda80533c64 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:36:43 +0800 Subject: sched: Split cpuacct code out of sched.h Add cpuacct.h and let sched.h include it. Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/5155367B.2060506@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 48 +--------------------------------------------- 2 files changed, 53 insertions(+), 47 deletions(-) create mode 100644 kernel/sched/cpuacct.h (limited to 'kernel') diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000000..a7f3d4a8f535 --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,52 @@ +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +#ifdef CONFIG_CGROUP_CPUACCT + +#include <linux/cgroup.h> +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +extern struct cgroup_subsys cpuacct_subsys; +extern struct cpuacct root_cpuacct; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca || !ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3bd15a43eebc..8116cf8e350f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -7,6 +7,7 @@ #include <linux/stop_machine.h> #include "cpupri.h" +#include "cpuacct.h" extern __read_mostly int scheduler_running; @@ -950,14 +951,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 #ifdef CONFIG_SMP @@ -1054,45 +1047,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); -#ifdef CONFIG_CGROUP_CPUACCT -#include <linux/cgroup.h> -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca || !ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { -- cgit v1.2.3 From dbe4b41f9800223949ce72e4289814697e0ea91a Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:36:55 +0800 Subject: sched/cpuacct: Add cpuacct_init() So we don't open-coded initialization of cpuacct in core.c. Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/51553687.1060906@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/core.c | 8 ++------ kernel/sched/cpuacct.c | 7 +++++++ kernel/sched/cpuacct.h | 5 +++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c28222f72c80..92930a89529d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6936,12 +6936,8 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif + cpuacct_init(); + for_each_possible_cpu(i) { struct rq *rq; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 50ec24b6193d..48b5e9184dcc 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -218,6 +218,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +void __init cpuacct_init(void) +{ + root_cpuacct.cpustat = &kernel_cpustat; + root_cpuacct.cpuusage = alloc_percpu(u64); + BUG_ON(!root_cpuacct.cpuusage); /* Too early, not expected to fail */ +} + struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .css_alloc = cpuacct_css_alloc, diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index a7f3d4a8f535..551acd729562 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -41,10 +41,15 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return cgroup_ca(ca->css.cgroup->parent); } +extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); #else +static inline void cpuacct_init(void) +{ +} + static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } -- cgit v1.2.3 From 1966aaf7d54608e8ddb7ac454b461840e763409a Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:37:06 +0800 Subject: sched/cpuacct: Add cpuacct_acount_field() So we can remove open-coded cpuacct code in cputime.c. Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/51553692.9060008@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 23 +++++++++++++++++++++++ kernel/sched/cpuacct.h | 6 ++++++ kernel/sched/cputime.c | 18 +----------------- 3 files changed, 30 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 48b5e9184dcc..72bd971ea377 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -218,6 +218,29 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. + */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = parent_ca(ca); + } + rcu_read_unlock(); +} + void __init cpuacct_init(void) { root_cpuacct.cpustat = &kernel_cpustat; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index 551acd729562..bd0409b85525 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -43,6 +43,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); #else @@ -54,4 +55,9 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 699d59756ece..33508dc78d0c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) static inline void task_group_account_field(struct task_struct *p, int index, u64 tmp) { -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif /* * Since all updates are sure to touch the root cgroup, we * get ourselves ahead and touch it first. If the root cgroup @@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif + cpuacct_account_field(p, index, tmp); } /* -- cgit v1.2.3 From 543bc0e76e6bb84300eaf9833edc5a481f788678 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:37:29 +0800 Subject: sched/cpuacct: Remove redundant NULL checks in cpuacct_charge() This is a micro optimization for the hot path. - We don't need to check if @ca is NULL in parent_ca(). - We don't need to check if @ca is NULL in the beginning of the for loop. Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/515536A9.5000700@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 6 +++++- kernel/sched/cpuacct.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 72bd971ea377..b2aaaba16d46 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -210,9 +210,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); - for (; ca; ca = parent_ca(ca)) { + while (true) { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; } rcu_read_unlock(); diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index bd0409b85525..45c168207fcc 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -36,7 +36,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - if (!ca || !ca->css.cgroup->parent) + if (!ca->css.cgroup->parent) return NULL; return cgroup_ca(ca->css.cgroup->parent); } -- cgit v1.2.3 From 5f40d804325e925409907e29f46ecb012090b6c2 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:37:43 +0800 Subject: sched/cpuacct: Remove redundant NULL checks in cpuacct_acount_field() This is a micro optimazation for a hot path. - We don't need to check if @ca returned from task_ca() is NULL. - We don't need to check if @ca returned from parent_ca() is NULL. Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/515536B7.6060602@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 4 ++-- kernel/sched/cpuacct.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index b2aaaba16d46..071ae8d08181 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -237,10 +237,10 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_lock(); ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { + while (ca != &root_cpuacct) { kcpustat = this_cpu_ptr(ca->cpustat); kcpustat->cpustat[index] += val; - ca = parent_ca(ca); + ca = __parent_ca(ca); } rcu_read_unlock(); } diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index 45c168207fcc..b2f79ad1e524 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -34,6 +34,11 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) struct cpuacct, css); } +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + static inline struct cpuacct *parent_ca(struct cpuacct *ca) { if (!ca->css.cgroup->parent) -- cgit v1.2.3 From d1712796a880bea0a44739941116001923f3275b Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:38:13 +0800 Subject: sched/cpuacct: Clean up cpuacct.h Now most of the code in cpuacct.h can be moved to cpuacct.c Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/515536D5.2080401@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 44 +++++++++++++++++++++++++++++++++++++++++++- kernel/sched/cpuacct.h | 46 ---------------------------------------------- 2 files changed, 43 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 071ae8d08181..9305fd2f8cf9 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -16,7 +16,49 @@ * (balbir@in.ibm.com). */ -struct cpuacct root_cpuacct; +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +static struct cpuacct root_cpuacct; /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index b2f79ad1e524..51cd76eb4f0f 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,51 +1,5 @@ -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - #ifdef CONFIG_CGROUP_CPUACCT -#include <linux/cgroup.h> -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -{ - return cgroup_ca(ca->css.cgroup->parent); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); -- cgit v1.2.3 From 7943e15a3e91db78a7a3fbc84e45cf9d1c7c7d23 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:43:46 +0800 Subject: sched/cpuacct: Allocate per_cpu cpuusage for root cpuacct statically This is a preparation, so later we can initialize cpuacct earlier. Signed-off-by: Li Zefan <lizefan@huawei.com> Cc: Tejun Heo <tj@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/51553822.5000403@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9305fd2f8cf9..a691c4dd65be 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -58,6 +58,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return cgroup_ca(ca->css.cgroup->parent); } +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct; /* create a new cpu accounting group */ @@ -290,8 +291,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) void __init cpuacct_init(void) { root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - BUG_ON(!root_cpuacct.cpuusage); /* Too early, not expected to fail */ + root_cpuacct.cpuusage = &root_cpuacct_cpuusage; } struct cgroup_subsys cpuacct_subsys = { -- cgit v1.2.3 From 14c6d3c8a47ced185b6375c4940b5b393f1a294e Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:44:04 +0800 Subject: sched/cpuacct: Initialize root cpuacct earlier Now we don't need cpuacct_init(), and instead we just initialize root_cpuacct when it's defined. Signed-off-by: Li Zefan <lizefan@huawei.com> Cc: Tejun Heo <tj@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/51553834.9090701@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/core.c | 2 -- kernel/sched/cpuacct.c | 11 ++++------- kernel/sched/cpuacct.h | 5 ----- 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 92930a89529d..ee8c1bd703fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6936,8 +6936,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ - cpuacct_init(); - for_each_possible_cpu(i) { struct rq *rq; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index a691c4dd65be..04255814a0ed 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -59,7 +59,10 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); -static struct cpuacct root_cpuacct; +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) @@ -288,12 +291,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -void __init cpuacct_init(void) -{ - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = &root_cpuacct_cpuusage; -} - struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .css_alloc = cpuacct_css_alloc, diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index 51cd76eb4f0f..ed605624a5e7 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,15 +1,10 @@ #ifdef CONFIG_CGROUP_CPUACCT -extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); #else -static inline void cpuacct_init(void) -{ -} - static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } -- cgit v1.2.3 From 621e2de02403a6f776852c564b79c38bf3cc9032 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:44:15 +0800 Subject: sched/cpuacct: Initialize cpuacct subsystem earlier Initialize cpuacct before the scheduler is functioning, so when cpuacct_charge() and cpuacct_account_field() are called, task_ca() won't return NULL. Signed-off-by: Li Zefan <lizefan@huawei.com> Cc: Tejun Heo <tj@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/5155383F.8000005@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 04255814a0ed..75e46d291f9c 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -292,9 +292,10 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) } struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, + .early_init = 1, }; -- cgit v1.2.3 From a2b0ae25fc8bfeeb4022b8e847ab811b3c8368d1 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:44:28 +0800 Subject: sched/cpuacct: No need to check subsys active state Now we're guaranteed when cpuacct_charge() and cpuacct_account_field() are called, cpuacct has already been properly initialized, so we no longer need those checks. Signed-off-by: Li Zefan <lizefan@huawei.com> Cc: Tejun Heo <tj@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/5155384C.7000508@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 75e46d291f9c..ef57ab658722 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -247,9 +247,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cpuacct *ca; int cpu; - if (unlikely(!cpuacct_subsys.active)) - return; - cpu = task_cpu(tsk); rcu_read_lock(); @@ -278,9 +275,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) struct kernel_cpustat *kcpustat; struct cpuacct *ca; - if (unlikely(!cpuacct_subsys.active)) - return; - rcu_read_lock(); ca = task_ca(p); while (ca != &root_cpuacct) { -- cgit v1.2.3 From 479f614110b889d5783acdaec865ede3cdb96b97 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 29 Mar 2013 14:44:42 +0800 Subject: cgroup: Kill subsys.active flag The only user was cpuacct. Acked-by: Tejun Heo <tj@kernel.org> Signed-off-by: Li Zefan <lizefan@huawei.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/5155385A.4040207@huawei.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 3 --- 2 files changed, 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f55..95c02c0e35b3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -458,7 +458,6 @@ struct cgroup_subsys { void (*bind)(struct cgroup *root); int subsys_id; - int active; int disabled; int early_init; /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..5c462813722d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4468,7 +4468,6 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - ss->active = 1; BUG_ON(online_css(ss, dummytop)); mutex_unlock(&cgroup_mutex); @@ -4573,7 +4572,6 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ss->active = 1; ret = online_css(ss, dummytop); if (ret) goto err_unload; @@ -4614,7 +4612,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); offline_css(ss, dummytop); - ss->active = 0; if (ss->use_id) idr_destroy(&ss->idr); -- cgit v1.2.3 From 8184004ed7a0bc9538f5e825615c29fc52466bab Mon Sep 17 00:00:00 2001 From: Sasha Levin <sasha.levin@oracle.com> Date: Wed, 20 Mar 2013 11:58:25 -0400 Subject: locking/rtmutex/tester: Set correct permissions on sysfs files sysfs started complaining about cases where permissions don't match what's in the sysfs ops structure (such as allowing read without a "show" callback). Signed-off-by: Sasha Levin <sasha.levin@oracle.com> Cc: williams@redhat.com Link: http://lkml.kernel.org/r/1363795105-5884-1-git-send-email-sasha.levin@oracle.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/rtmutex-tester.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 7890b10084a7..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/stat.h> #include "rtmutex.h" @@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at return curr - buf; } -static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); -static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); static struct bus_type rttest_subsys = { .name = "rttest", -- cgit v1.2.3 From b329fd5b018ffd64cfef6a2551bb2ca4bbfbacf2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar <mingo@kernel.org> Date: Wed, 10 Apr 2013 15:10:50 +0200 Subject: sched/cpuacct/UML: Fix header file dependency bug on the UML build The cpuacct split caused this build failure on UML: kernel/sched/cpuacct.c:94:2: error: implicit declaration of function 'ERR_PTR' Cc: Li Zefan <lizefan@huawei.com> Cc: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/cpuacct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index ef57ab658722..dbb7e2cd95eb 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,7 @@ #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <linux/kernel_stat.h> +#include <linux/err.h> #include "sched.h" -- cgit v1.2.3 From 84cfb6ab484b442d5115eb3baf9db7d74a3ea626 Mon Sep 17 00:00:00 2001 From: Rami Rosen <ramirose@gmail.com> Date: Wed, 10 Apr 2013 14:41:17 +0300 Subject: cgroup: remove bind() method from cgroup_subsys. The bind() method of cgroup_subsys is not used in any of the controllers (cpuset, freezer, blkio, net_cls, memcg, net_prio, devices, perf, hugetlb, cpu and cpuacct) tj: Removed the entry on ->bind() from Documentation/cgroups/cgroups.txt. Also updated a couple paragraphs which were suggesting that dynamic re-binding may be implemented. It's not gonna. Signed-off-by: Rami Rosen <ramirose@gmail.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- Documentation/cgroups/cgroups.txt | 20 +++++--------------- include/linux/cgroup.h | 2 -- kernel/cgroup.c | 4 ---- 3 files changed, 5 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 638bf17ff869..2b51e12ce178 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -211,10 +211,9 @@ matches, and any of the requested subsystems are in use in an existing hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy is activated, associated with the requested subsystems. -It's not currently possible to bind a new subsystem to an active -cgroup hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. This may be possible in future, but is fraught with nasty -error-recovery issues. +It's not possible to bind a new subsystem to an active cgroup +hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. When a cgroup filesystem is unmounted, if there are any child cgroups created below the top-level cgroup, that hierarchy @@ -382,10 +381,8 @@ To Specify a hierarchy's release_agent: Note that specifying 'release_agent' more than once will return failure. -Note that changing the set of subsystems is currently only supported -when the hierarchy consists of a single (root) cgroup. Supporting -the ability to arbitrarily bind/unbind subsystems from an existing -cgroup hierarchy is intended to be implemented in the future. +Note that changing the set of subsystems is only supported when the +hierarchy consists of a single (root) cgroup. Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 @@ -643,13 +640,6 @@ void exit(struct task_struct *task) Called during task exit. -void bind(struct cgroup *root) -(cgroup_mutex held by caller) - -Called when a cgroup subsystem is rebound to a different hierarchy -and root cgroup. Currently this will only involve movement between -the default hierarchy (which never has sub-cgroups) and a hierarchy -that is being created/destroyed (and hence has no sub-cgroups). 4. Extended attribute usage =========================== diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 515927eebb37..92acf8601ae0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -483,8 +483,6 @@ struct cgroup_subsys { void (*fork)(struct task_struct *task); void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task); - void (*bind)(struct cgroup *root); - int subsys_id; int active; int disabled; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ba3e24a76dae..fd38e1cfacca 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1064,16 +1064,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; - if (ss->bind) - ss->bind(cgrp); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); - if (ss->bind) - ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; -- cgit v1.2.3 From 415cf07a1c1c65249773330434878ae7bcd92d0f Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Mon, 8 Apr 2013 14:35:02 +0800 Subject: cgroup: make sure parent won't be destroyed before its children Suppose we rmdir a cgroup and there're still css refs, this cgroup won't be freed. Then we rmdir the parent cgroup, and the parent is freed immediately due to css ref draining to 0. Now it would be a disaster if the still-alive child cgroup tries to access its parent. Make sure this won't happen. Signed-off-by: Li Zefan <lizefan@huawei.com> Reviewed-by: Michal Hocko <mhocko@suse.cz> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index fd38e1cfacca..65b72d0367c5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -875,6 +875,13 @@ static void cgroup_free_fn(struct work_struct *work) cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); + /* + * We get a ref to the parent's dentry, and put the ref when + * this cgroup is being freed, so it's guaranteed that the + * parent won't be destroyed before its children. + */ + dput(cgrp->parent->dentry); + /* * Drop the active superblock reference that we took when we * created the cgroup @@ -4164,6 +4171,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) dget(dentry); + /* hold a ref to the parent's dentry */ + dget(parent->dentry); + /* creation succeeded, notify subsystems */ for_each_subsys(root, ss) { err = online_css(ss, cgrp); -- cgit v1.2.3 From 78574cf981cd3d9ae9f6adbd466a772310ec24ff Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Mon, 8 Apr 2013 19:00:38 -0700 Subject: cgroup: implement cgroup_is_descendant() A couple controllers want to determine whether two cgroups are in ancestor/descendant relationship. As it's more likely that the descendant is the primary subject of interest and there are other operations focusing on the descendants, let's ask is_descendent rather than is_ancestor. Implementation is trivial as the previous patch guarantees that all ancestors of a cgroup stay accessible as long as the cgroup is accessible. tj: Removed depth optimization, renamed from cgroup_is_ancestor(), rewrote descriptions. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 92acf8601ae0..a2b9d4b13369 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -439,6 +439,7 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_is_removed(const struct cgroup *cgrp); +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 65b72d0367c5..7bf3ce09c50c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -276,6 +276,26 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) return test_bit(CGRP_REMOVED, &cgrp->flags); } +/** + * cgroup_is_descendant - test ancestry + * @cgrp: the cgroup to be tested + * @ancestor: possible ancestor of @cgrp + * + * Test whether @cgrp is a descendant of @ancestor. It also returns %true + * if @cgrp == @ancestor. This function is safe to call as long as @cgrp + * and @ancestor are accessible. + */ +bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) +{ + while (cgrp) { + if (cgrp == ancestor) + return true; + cgrp = cgrp->parent; + } + return false; +} +EXPORT_SYMBOL_GPL(cgroup_is_descendant); + /* bits in struct cgroupfs_root flags field */ enum { ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ -- cgit v1.2.3 From ef824fa129b7579f56b92d466ecda2e378879806 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Mon, 8 Apr 2013 19:00:38 -0700 Subject: perf: make perf_event cgroup hierarchical perf_event is one of a couple remaining cgroup controllers with broken hierarchy support. Converting it to support hierarchy is almost trivial. The only thing necessary is to consider a task belonging to a descendant cgroup as a match. IOW, if the cgroup of the currently executing task (@cpuctx->cgrp) equals or is a descendant of the event's cgroup (@event->cgrp), then the event should be enabled. Implement hierarchy support and remove .broken_hierarchy tag along with the incorrect comment on what needs to be done for hierarchy support. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Paul Mackerras <paulus@samba.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Cc: Stephane Eranian <eranian@google.com> Cc: Namhyung Kim <namhyung.kim@lge.com> --- kernel/events/core.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c30..310ec19d968a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -251,7 +251,22 @@ perf_cgroup_match(struct perf_event *event) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - return !event->cgrp || event->cgrp == cpuctx->cgrp; + /* @event doesn't care about cgroup */ + if (!event->cgrp) + return true; + + /* wants specific cgroup scope but @cpuctx isn't associated with any */ + if (!cpuctx->cgrp) + return false; + + /* + * Cgroup scoping is recursive. An event enabled for a cgroup is + * also enabled for all its descendant cgroups. If @cpuctx's + * cgroup is a descendant of @event's (the test covers identity + * case), it's a match. + */ + return cgroup_is_descendant(cpuctx->cgrp->css.cgroup, + event->cgrp->css.cgroup); } static inline bool perf_tryget_cgroup(struct perf_event *event) @@ -7509,12 +7524,5 @@ struct cgroup_subsys perf_subsys = { .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, - - /* - * perf_event cgroup doesn't handle nesting correctly. - * ctx->nr_cgroups adjustments should be propagated through the - * cgroup hierarchy. Fix it and remove the following. - */ - .broken_hierarchy = true, }; #endif /* CONFIG_CGROUP_PERF */ -- cgit v1.2.3 From c481420248c6730246d2a1b1773d5d7007ae0835 Mon Sep 17 00:00:00 2001 From: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Date: Fri, 12 Apr 2013 11:05:54 +0800 Subject: perf: Fix error return code Fix to return -ENOMEM in the allocation error case instead of 0 (if pmu_bus_running == 1), as done elsewhere in this function. Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn> Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/CAPgLHd8j_fWcgqe%3DKLWjpBj%2B%3Do0Pw6Z-SEq%3DNTPU08c2w1tngQ@mail.gmail.com [ Tweaked the error code setting placement and the changelog. ] Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f0d67ea3f48..7e0962ed7f8a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5987,6 +5987,7 @@ skip_type: if (pmu->pmu_cpu_context) goto got_cpu_context; + ret = -ENOMEM; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_dev; -- cgit v1.2.3 From f2530dc71cf0822f90bb63ea4600caaef33a66bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Tue, 9 Apr 2013 09:33:34 +0200 Subject: kthread: Prevent unpark race which puts threads on the wrong cpu The smpboot threads rely on the park/unpark mechanism which binds per cpu threads on a particular core. Though the functionality is racy: CPU0 CPU1 CPU2 unpark(T) wake_up_process(T) clear(SHOULD_PARK) T runs leave parkme() due to !SHOULD_PARK bind_to(CPU2) BUG_ON(wrong CPU) We cannot let the tasks move themself to the target CPU as one of those tasks is actually the migration thread itself, which requires that it starts running on the target cpu right away. The solution to this problem is to prevent wakeups in park mode which are not from unpark(). That way we can guarantee that the association of the task to the target cpu is working correctly. Add a new task state (TASK_PARKED) which prevents other wakeups and use this state explicitly for the unpark wakeup. Peter noticed: Also, since the task state is visible to userspace and all the parked tasks are still in the PID space, its a good hint in ps and friends that these tasks aren't really there for the moment. The migration thread has another related issue. CPU0 CPU1 Bring up CPU2 create_thread(T) park(T) wait_for_completion() parkme() complete() sched_set_stop_task() schedule(TASK_PARKED) The sched_set_stop_task() call is issued while the task is on the runqueue of CPU1 and that confuses the hell out of the stop_task class on that cpu. So we need the same synchronizaion before sched_set_stop_task(). Reported-by: Dave Jones <davej@redhat.com> Reported-and-tested-by: Dave Hansen <dave@sr71.net> Reported-and-tested-by: Borislav Petkov <bp@alien8.de> Acked-by: Peter Ziljstra <peterz@infradead.org> Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com> Cc: dhillf@gmail.com Cc: Ingo Molnar <mingo@kernel.org> Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- fs/proc/array.c | 1 + include/linux/sched.h | 5 +++-- include/trace/events/sched.h | 2 +- kernel/kthread.c | 52 ++++++++++++++++++++++++-------------------- kernel/smpboot.c | 14 ++++++++++-- 5 files changed, 45 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/fs/proc/array.c b/fs/proc/array.c index f7ed9ee46eb9..cbd0f1b324b9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -143,6 +143,7 @@ static const char * const task_state_array[] = { "x (dead)", /* 64 */ "K (wakekill)", /* 128 */ "W (waking)", /* 256 */ + "P (parked)", /* 512 */ }; static inline const char *get_task_state(struct task_struct *tsk) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..e692a022527b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -163,9 +163,10 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 -#define TASK_STATE_MAX 512 +#define TASK_PARKED 512 +#define TASK_STATE_MAX 1024 -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW" +#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5a8671e8a67f..e5586caff67a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -147,7 +147,7 @@ TRACE_EVENT(sched_switch, __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, - { 128, "W" }) : "R", + { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", __entry->prev_state & TASK_STATE_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..9eb7fed0bbaa 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) complete(&self->parked); schedule(); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); } clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); @@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, state)) { + WARN_ON(1); + return; + } /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); p->flags |= PF_THREAD_BOUND; @@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - __kthread_bind(p, cpu); + __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); @@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k) return NULL; } +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) +{ + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + wake_up_state(k, TASK_PARKED); + } +} + /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). @@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = task_get_live_kthread(k); - if (kthread) { - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. - */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu); - wake_up_process(k); - } - } + if (kthread) + __kthread_unpark(k, kthread); put_task_struct(k); } @@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k) trace_sched_kthread_stop(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); } diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 8eaed9aa9cf0..02fc5c933673 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) } get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; - if (ht->create) - ht->create(cpu); + if (ht->create) { + /* + * Make sure that the task has actually scheduled out + * into park position, before calling the create + * callback. At least the migration thread callback + * requires that the task is off the runqueue. + */ + if (!wait_task_inactive(tsk, TASK_PARKED)) + WARN_ON(1); + else + ht->create(cpu); + } return 0; } -- cgit v1.2.3 From 26d5bbe5ba2073fc7ef9e69a55543b2376f5bad0 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Fri, 12 Apr 2013 10:29:04 -0700 Subject: Revert "cgroup: remove bind() method from cgroup_subsys." This reverts commit 84cfb6ab484b442d5115eb3baf9db7d74a3ea626. There are scheduled changes which make use of the removed callback. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Rami Rosen <ramirose@gmail.com> Cc: Li Zefan <lizefan@huawei.com> --- Documentation/cgroups/cgroups.txt | 20 +++++++++++++++----- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 4 ++++ 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 2b51e12ce178..638bf17ff869 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -211,9 +211,10 @@ matches, and any of the requested subsystems are in use in an existing hierarchy, the mount will fail with -EBUSY. Otherwise, a new hierarchy is activated, associated with the requested subsystems. -It's not possible to bind a new subsystem to an active cgroup -hierarchy, or to unbind a subsystem from an active cgroup -hierarchy. +It's not currently possible to bind a new subsystem to an active +cgroup hierarchy, or to unbind a subsystem from an active cgroup +hierarchy. This may be possible in future, but is fraught with nasty +error-recovery issues. When a cgroup filesystem is unmounted, if there are any child cgroups created below the top-level cgroup, that hierarchy @@ -381,8 +382,10 @@ To Specify a hierarchy's release_agent: Note that specifying 'release_agent' more than once will return failure. -Note that changing the set of subsystems is only supported when the -hierarchy consists of a single (root) cgroup. +Note that changing the set of subsystems is currently only supported +when the hierarchy consists of a single (root) cgroup. Supporting +the ability to arbitrarily bind/unbind subsystems from an existing +cgroup hierarchy is intended to be implemented in the future. Then under /sys/fs/cgroup/rg1 you can find a tree that corresponds to the tree of the cgroups in the system. For instance, /sys/fs/cgroup/rg1 @@ -640,6 +643,13 @@ void exit(struct task_struct *task) Called during task exit. +void bind(struct cgroup *root) +(cgroup_mutex held by caller) + +Called when a cgroup subsystem is rebound to a different hierarchy +and root cgroup. Currently this will only involve movement between +the default hierarchy (which never has sub-cgroups) and a hierarchy +that is being created/destroyed (and hence has no sub-cgroups). 4. Extended attribute usage =========================== diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a2b9d4b13369..45aee0fc6b98 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -484,6 +484,8 @@ struct cgroup_subsys { void (*fork)(struct task_struct *task); void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task); + void (*bind)(struct cgroup *root); + int subsys_id; int active; int disabled; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7bf3ce09c50c..678a22c75fdb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1091,12 +1091,16 @@ static int rebind_subsystems(struct cgroupfs_root *root, cgrp->subsys[i]->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; + if (ss->bind) + ss->bind(cgrp); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_mask) { /* We're removing this subsystem */ BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + if (ss->bind) + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; -- cgit v1.2.3 From 6a76f8c0ab19f215af2a3442870eeb5f0e81998d Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Thu, 11 Apr 2013 15:55:01 +0900 Subject: tracing: Fix possible NULL pointer dereferences Currently set_ftrace_pid and set_graph_function files use seq_lseek for their fops. However seq_open() is called only for FMODE_READ in the fops->open() so that if an user tries to seek one of those file when she open it for writing, it sees NULL seq_file and then panic. It can be easily reproduced with following command: $ cd /sys/kernel/debug/tracing $ echo 1234 | sudo tee -a set_ftrace_pid In this example, GNU coreutils' tee opens the file with fopen(, "a") and then the fopen() internally calls lseek(). Link: http://lkml.kernel.org/r/1365663302-2170-1-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace.h | 2 +- kernel/trace/ftrace.c | 10 +++++----- kernel/trace/trace_stack.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 167abf907802..eb3ce327b975 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -396,7 +396,7 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_regex_lseek(struct file *file, loff_t offset, int whence); +loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 926ebfb74936..affc35d829cc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2697,7 +2697,7 @@ ftrace_notrace_open(struct inode *inode, struct file *file) } loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int whence) +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) { loff_t ret; @@ -3570,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3578,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3783,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, + .llseek = ftrace_filter_lseek, .release = ftrace_graph_release, - .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4439,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_pid_release, }; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc701..83a8b5b7bd35 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -322,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; -- cgit v1.2.3 From 7f49ef69db6bbf756c0abca7e9b65b32e999eec8 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" <rostedt@goodmis.org> Date: Fri, 12 Apr 2013 16:40:13 -0400 Subject: ftrace: Move ftrace_filter_lseek out of CONFIG_DYNAMIC_FTRACE section As ftrace_filter_lseek is now used with ftrace_pid_fops, it needs to be moved out of the #ifdef CONFIG_DYNAMIC_FTRACE section as the ftrace_pid_fops is defined when DYNAMIC_FTRACE is not. Cc: stable@vger.kernel.org Cc: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- include/linux/ftrace.h | 3 ++- kernel/trace/ftrace.c | 28 ++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index eb3ce327b975..52da2a250795 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -396,7 +396,6 @@ ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t ftrace_notrace_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos); -loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); int ftrace_regex_release(struct inode *inode, struct file *file); void __init @@ -569,6 +568,8 @@ static inline int ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; } #endif /* CONFIG_DYNAMIC_FTRACE */ +loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence); + /* totally disable ftrace - can not re-enable after this */ void ftrace_kill(void); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index affc35d829cc..2461ede45a8d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1052,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; +loff_t +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, whence); + else + file->f_pos = ret = 1; + + return ret; +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -2612,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * ftrace_filter_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -2696,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -loff_t -ftrace_filter_lseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, whence); - else - file->f_pos = ret = 1; - - return ret; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; -- cgit v1.2.3 From 9f50afccfdc15d95d7331acddcb0f7703df089ae Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Thu, 11 Apr 2013 16:01:38 +0900 Subject: tracing: Reset ftrace_graph_filter_enabled if count is zero The ftrace_graph_count can be decreased with a "!" pattern, so that the enabled flag should be updated too. Link: http://lkml.kernel.org/r/1365663698-2413-1-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Ingo Molnar <mingo@kernel.org> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c9f31491009f..9e3198782507 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3792,7 +3792,8 @@ out: if (fail) return -EINVAL; - ftrace_graph_filter_enabled = 1; + ftrace_graph_filter_enabled = !!(*idx); + return 0; } -- cgit v1.2.3 From f1943977e6648c1d42a78eda4ba4429a2bc0b786 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Wed, 10 Apr 2013 09:18:11 +0900 Subject: tracing: Get rid of unneeded key calculation in ftrace_hash_move() It's not used anywhere in the function. Link: http://lkml.kernel.org/r/1365553093-10180-1-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9e3198782507..3b84fc100788 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1320,7 +1320,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct hlist_head *hhd; struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; - unsigned long key; int size = src->count; int bits = 0; int ret; @@ -1363,10 +1362,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, for (i = 0; i < size; i++) { hhd = &src->buckets[i]; hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { - if (bits > 0) - key = hash_long(entry->ip, bits); - else - key = 0; remove_hash_entry(src, entry); __add_hash_entry(new_hash, entry); } -- cgit v1.2.3 From ed6f1c996bfe4b6e520cf7a74b51cd6988d84420 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Wed, 10 Apr 2013 09:18:12 +0900 Subject: tracing: Check return value of tracing_init_dentry() Check return value and bail out if it's NULL. Link: http://lkml.kernel.org/r/1365553093-10180-2-git-send-email-namhyung@kernel.org Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Namhyung Kim <namhyung.kim@lge.com> Cc: stable@vger.kernel.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/trace.c | 2 ++ kernel/trace/trace_stack.c | 2 ++ kernel/trace/trace_stat.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 07860b995752..72970793b40a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5953,6 +5953,8 @@ static __init int tracer_init_debugfs(void) trace_access_lock_init(); d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; init_tracer_debugfs(&global_trace, d_tracer); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index aab277b67fa9..8c3f37e2dc43 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -431,6 +431,8 @@ static __init int stack_trace_init(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; trace_create_file("stack_max_size", 0644, d_tracer, &max_stack_size, &stack_max_size_fops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e73..847f88a6194b 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -307,6 +307,8 @@ static int tracing_stat_init(void) struct dentry *d_tracing; d_tracing = tracing_init_dentry(); + if (!d_tracing) + return 0; stat_dir = debugfs_create_dir("trace_stat", d_tracing); if (!stat_dir) -- cgit v1.2.3 From 20079ebe73c16b34621abd2993f3d48e2f9336b7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim <namhyung.kim@lge.com> Date: Wed, 10 Apr 2013 08:55:50 +0900 Subject: ftrace: Get rid of ftrace_profile_bits It seems that function profiler's hash size is fixed at 1024. Add and use FTRACE_PROFILE_HASH_BITS instead and update hash size macro. Link: http://lkml.kernel.org/r/1365551750-4504-1-git-send-email-namhyung@kernel.org Signed-off-by: Namhyung Kim <namhyung@kernel.org> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/trace/ftrace.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3b84fc100788..9b44abb2c5a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -486,7 +486,6 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly; static int ftrace_profile_enabled __read_mostly; /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS) static void * function_stat_next(void *v, int idx) @@ -725,13 +725,6 @@ static int ftrace_profile_init_cpu(int cpu) if (!stat->hash) return -ENOMEM; - if (!ftrace_profile_bits) { - size--; - - for (; size; size >>= 1) - ftrace_profile_bits++; - } - /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); @@ -765,7 +758,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) struct hlist_node *n; unsigned long key; - key = hash_long(ip, ftrace_profile_bits); + key = hash_long(ip, FTRACE_PROFILE_HASH_BITS); hhd = &stat->hash[key]; if (hlist_empty(hhd)) @@ -784,7 +777,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, { unsigned long key; - key = hash_long(rec->ip, ftrace_profile_bits); + key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS); hlist_add_head_rcu(&rec->node, &stat->hash[key]); } -- cgit v1.2.3 From ea024870cf10687b3fded66a9deb6253888f30b7 Mon Sep 17 00:00:00 2001 From: Anton Arapov <anton@redhat.com> Date: Wed, 3 Apr 2013 18:00:31 +0200 Subject: uretprobes: Introduce uprobe_consumer->ret_handler() Enclose return probes implementation, introduce ->ret_handler() and update existing code to rely on ->handler() *and* ->ret_handler() for uprobe and uretprobe respectively. Signed-off-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- include/linux/uprobes.h | 3 +++ kernel/events/uprobes.c | 17 ++++++++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 19612881399a..5c8d3290df41 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -46,6 +46,9 @@ enum uprobe_filter_ctx { struct uprobe_consumer { int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs); + int (*ret_handler)(struct uprobe_consumer *self, + unsigned long func, + struct pt_regs *regs); bool (*filter)(struct uprobe_consumer *self, enum uprobe_filter_ctx ctx, struct mm_struct *mm); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7312503caf2e..eb384e90ac92 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -838,6 +838,14 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * struct uprobe *uprobe; int ret; + /* Uprobe must have at least one set consumer */ + if (!uc->handler && !uc->ret_handler) + return -EINVAL; + + /* TODO: Implement return probes */ + if (uc->ret_handler) + return -ENOSYS; + /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; @@ -1497,10 +1505,13 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { - int rc = uc->handler(uc, regs); + int rc = 0; - WARN(rc & ~UPROBE_HANDLER_MASK, - "bad rc=0x%x from %pf()\n", rc, uc->handler); + if (uc->handler) { + rc = uc->handler(uc, regs); + WARN(rc & ~UPROBE_HANDLER_MASK, + "bad rc=0x%x from %pf()\n", rc, uc->handler); + } remove &= rc; } -- cgit v1.2.3 From e78aebfd27256ca59ccd3e6cf62cfad2a80e02d3 Mon Sep 17 00:00:00 2001 From: Anton Arapov <anton@redhat.com> Date: Wed, 3 Apr 2013 18:00:32 +0200 Subject: uretprobes: Reserve the first slot in xol_vma for trampoline Allocate trampoline page, as the very first one in uprobed task xol area, and fill it with breakpoint opcode. Also introduce get_trampoline_vaddr() helper, to wrap the trampoline address extraction from area->vaddr. That removes confusion and eases the debug experience in case ->vaddr notion will be changed. Signed-off-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- kernel/events/uprobes.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eb384e90ac92..d345b7c6cb2d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1132,6 +1132,7 @@ static struct xol_area *get_xol_area(void) { struct mm_struct *mm = current->mm; struct xol_area *area; + uprobe_opcode_t insn = UPROBE_SWBP_INSN; area = mm->uprobes_state.xol_area; if (area) @@ -1149,7 +1150,12 @@ static struct xol_area *get_xol_area(void) if (!area->page) goto free_bitmap; + /* allocate first slot of task's xol_area for the return probes */ + set_bit(0, area->bitmap); + copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + atomic_set(&area->slot_count, 1); init_waitqueue_head(&area->wq); + if (!xol_add_vma(area)) return area; @@ -1346,6 +1352,25 @@ static struct uprobe_task *get_utask(void) return current->utask; } +/* + * Current area->vaddr notion assume the trampoline address is always + * equal area->vaddr. + * + * Returns -1 in case the xol_area is not allocated. + */ +static unsigned long get_trampoline_vaddr(void) +{ + struct xol_area *area; + unsigned long trampoline_vaddr = -1; + + area = current->mm->uprobes_state.xol_area; + smp_read_barrier_depends(); + if (area) + trampoline_vaddr = area->vaddr; + + return trampoline_vaddr; +} + /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) -- cgit v1.2.3 From 0dfd0eb8e4d72ded8b21f4fee74ba5547408cbe9 Mon Sep 17 00:00:00 2001 From: Anton Arapov <anton@redhat.com> Date: Wed, 3 Apr 2013 18:00:35 +0200 Subject: uretprobes: Return probe entry, prepare_uretprobe() When a uprobe with return probe consumer is hit, prepare_uretprobe() function is invoked. It creates return_instance, hijacks return address and replaces it with the trampoline. * Return instances are kept as stack per uprobed task. * Return instance is chained, when the original return address is trampoline's page vaddr (e.g. recursive call of the probed function). Signed-off-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 5c8d3290df41..b0507f24eeb0 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -71,6 +71,7 @@ struct uprobe_task { enum uprobe_task_state state; struct arch_uprobe_task autask; + struct return_instance *return_instances; struct uprobe *active_uprobe; unsigned long xol_vaddr; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index d345b7c6cb2d..3798947b3b58 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -75,6 +75,15 @@ struct uprobe { struct arch_uprobe arch; }; +struct return_instance { + struct uprobe *uprobe; + unsigned long func; + unsigned long orig_ret_vaddr; /* original return address */ + bool chained; /* true, if instance is nested */ + + struct return_instance *next; /* keep as stack */ +}; + /* * valid_vma: Verify if the specified vma is an executable vma * Relax restrictions while unregistering: vm_flags might have @@ -1317,6 +1326,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; + struct return_instance *ri, *tmp; if (!utask) return; @@ -1324,6 +1334,15 @@ void uprobe_free_utask(struct task_struct *t) if (utask->active_uprobe) put_uprobe(utask->active_uprobe); + ri = utask->return_instances; + while (ri) { + tmp = ri; + ri = ri->next; + + put_uprobe(tmp->uprobe); + kfree(tmp); + } + xol_free_insn_slot(t); kfree(utask); t->utask = NULL; @@ -1371,6 +1390,65 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) +{ + struct return_instance *ri; + struct uprobe_task *utask; + unsigned long orig_ret_vaddr, trampoline_vaddr; + bool chained = false; + + if (!get_xol_area()) + return; + + utask = get_utask(); + if (!utask) + return; + + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + if (!ri) + goto fail; + + trampoline_vaddr = get_trampoline_vaddr(); + orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); + if (orig_ret_vaddr == -1) + goto fail; + + /* + * We don't want to keep trampoline address in stack, rather keep the + * original return address of first caller thru all the consequent + * instances. This also makes breakpoint unwrapping easier. + */ + if (orig_ret_vaddr == trampoline_vaddr) { + if (!utask->return_instances) { + /* + * This situation is not possible. Likely we have an + * attack from user-space. + */ + pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + goto fail; + } + + chained = true; + orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; + } + + atomic_inc(&uprobe->ref); + ri->uprobe = uprobe; + ri->func = instruction_pointer(regs); + ri->orig_ret_vaddr = orig_ret_vaddr; + ri->chained = chained; + + /* add instance to the stack */ + ri->next = utask->return_instances; + utask->return_instances = ri; + + return; + + fail: + kfree(ri); +} + /* Prepare to single-step probed instruction out of line. */ static int pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr) @@ -1527,6 +1605,7 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) { struct uprobe_consumer *uc; int remove = UPROBE_HANDLER_REMOVE; + bool need_prep = false; /* prepare return uprobe, when needed */ down_read(&uprobe->register_rwsem); for (uc = uprobe->consumers; uc; uc = uc->next) { @@ -1537,9 +1616,16 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) WARN(rc & ~UPROBE_HANDLER_MASK, "bad rc=0x%x from %pf()\n", rc, uc->handler); } + + if (uc->ret_handler) + need_prep = true; + remove &= rc; } + if (need_prep && !remove) + prepare_uretprobe(uprobe, regs); /* put bp at return */ + if (remove && uprobe->consumers) { WARN_ON(!uprobe_is_active(uprobe)); unapply_uprobe(uprobe, current->mm); @@ -1658,7 +1744,11 @@ void uprobe_notify_resume(struct pt_regs *regs) */ int uprobe_pre_sstep_notifier(struct pt_regs *regs) { - if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) + if (!current->mm) + return 0; + + if (!test_bit(MMF_HAS_UPROBES, ¤t->mm->flags) && + (!current->utask || !current->utask->return_instances)) return 0; set_thread_flag(TIF_UPROBE); -- cgit v1.2.3 From fec8898d86ad0fb4d2161fc89885fa52da1e8b72 Mon Sep 17 00:00:00 2001 From: Anton Arapov <anton@redhat.com> Date: Wed, 3 Apr 2013 18:00:36 +0200 Subject: uretprobes: Return probe exit, invoke handlers Uretprobe handlers are invoked when the trampoline is hit, on completion the trampoline is replaced with the saved return address and the uretprobe instance deleted. TODO: handle_trampoline() assumes that ->return_instances is always valid. We should teach it to handle longjmp() which can invalidate the pending return_instance's. This is nontrivial, we will try to do this in a separate series. Signed-off-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- kernel/events/uprobes.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 3798947b3b58..65429ad2ce51 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1633,6 +1633,62 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } +static void +handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) +{ + struct uprobe *uprobe = ri->uprobe; + struct uprobe_consumer *uc; + + down_read(&uprobe->register_rwsem); + for (uc = uprobe->consumers; uc; uc = uc->next) { + if (uc->ret_handler) + uc->ret_handler(uc, ri->func, regs); + } + up_read(&uprobe->register_rwsem); +} + +static bool handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *tmp; + bool chained; + + utask = current->utask; + if (!utask) + return false; + + ri = utask->return_instances; + if (!ri) + return false; + + /* + * TODO: we should throw out return_instance's invalidated by + * longjmp(), currently we assume that the probed function always + * returns. + */ + instruction_pointer_set(regs, ri->orig_ret_vaddr); + + for (;;) { + handle_uretprobe_chain(ri, regs); + + chained = ri->chained; + put_uprobe(ri->uprobe); + + tmp = ri; + ri = ri->next; + kfree(tmp); + + if (!chained) + break; + + BUG_ON(!ri); + } + + utask->return_instances = ri; + + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1644,8 +1700,15 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - uprobe = find_active_uprobe(bp_vaddr, &is_swbp); + if (bp_vaddr == get_trampoline_vaddr()) { + if (handle_trampoline(regs)) + return; + pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", + current->pid, current->tgid); + } + + uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { if (is_swbp > 0) { /* No matching uprobe; signal SIGTRAP. */ -- cgit v1.2.3 From ded49c55309a37129dc30a5f0e85b8a64e5c1716 Mon Sep 17 00:00:00 2001 From: Anton Arapov <anton@redhat.com> Date: Wed, 3 Apr 2013 18:00:37 +0200 Subject: uretprobes: Limit the depth of return probe nestedness Unlike the kretprobes we can't trust userspace, thus must have protection from user space attacks. User-space have "unlimited" stack, and this patch limits the return probes nestedness as a simple remedy for it. Note that this implementation leaks return_instance on siglongjmp until exit()/exec(). The intention is to have KISS and bare minimum solution for the initial implementation in order to not complicate the uretprobes code. In the future we may come up with more sophisticated solution that remove this depth limitation. It is not easy task and lays beyond this patchset. Signed-off-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- include/linux/uprobes.h | 3 +++ kernel/events/uprobes.c | 11 +++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index b0507f24eeb0..06f28beed7c2 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -38,6 +38,8 @@ struct inode; #define UPROBE_HANDLER_REMOVE 1 #define UPROBE_HANDLER_MASK 1 +#define MAX_URETPROBE_DEPTH 64 + enum uprobe_filter_ctx { UPROBE_FILTER_REGISTER, UPROBE_FILTER_UNREGISTER, @@ -72,6 +74,7 @@ struct uprobe_task { struct arch_uprobe_task autask; struct return_instance *return_instances; + unsigned int depth; struct uprobe *active_uprobe; unsigned long xol_vaddr; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 65429ad2ce51..6ab00e090c87 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1404,6 +1404,13 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) if (!utask) return; + if (utask->depth >= MAX_URETPROBE_DEPTH) { + printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to" + " nestedness limit pid/tgid=%d/%d\n", + current->pid, current->tgid); + return; + } + ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) goto fail; @@ -1439,6 +1446,8 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; + utask->depth++; + /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; @@ -1681,6 +1690,8 @@ static bool handle_trampoline(struct pt_regs *regs) if (!chained) break; + utask->depth--; + BUG_ON(!ri); } -- cgit v1.2.3 From a0d60aef4be10c498809c2985fc9c28fd503880d Mon Sep 17 00:00:00 2001 From: Anton Arapov <anton@redhat.com> Date: Wed, 3 Apr 2013 18:00:38 +0200 Subject: uretprobes: Remove -ENOSYS as return probes implemented Enclose return probes implementation. Signed-off-by: Anton Arapov <anton@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- kernel/events/uprobes.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6ab00e090c87..f3569747d629 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -851,10 +851,6 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * if (!uc->handler && !uc->ret_handler) return -EINVAL; - /* TODO: Implement return probes */ - if (uc->ret_handler) - return -ENOSYS; - /* Racy, just to catch the obvious mistakes */ if (offset > i_size_read(inode)) return -EINVAL; -- cgit v1.2.3 From 07720b63a964851928fa5d8b00ee5270d66b94f7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Thu, 28 Mar 2013 18:01:04 +0100 Subject: uprobes/tracing: Kill the pointless task_pt_regs() calls uprobe_trace_func() and uprobe_perf_func() do not need task_pt_regs(), we already have "struct pt_regs *regs". Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8dad2a92dee9..af5b7735cebf 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -507,7 +507,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) return 0; entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(task_pt_regs(current)); + entry->ip = instruction_pointer(regs); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -777,7 +777,7 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!entry) goto out; - entry->ip = instruction_pointer(task_pt_regs(current)); + entry->ip = instruction_pointer(regs); data = (u8 *)&entry[1]; for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); -- cgit v1.2.3 From 456fdbcb8648a20f56977efbc6f13e28936fcf0b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Thu, 28 Mar 2013 18:58:11 +0100 Subject: uprobes/tracing: Kill the pointless seq_print_ip_sym() call seq_print_ip_sym(ip) in print_uprobe_event() is pointless, kallsyms_lookup(ip) can not resolve a user-space address. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index af5b7735cebf..8e009016e4ba 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -531,13 +531,7 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e field = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, call.event); - if (!trace_seq_printf(s, "%s: (", tu->call.name)) - goto partial; - - if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) - goto partial; - - if (!trace_seq_puts(s, ")")) + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, field->ip)) goto partial; data = (u8 *)&field[1]; -- cgit v1.2.3 From 0e3853d202e8b2720bc4c674dc58849b2662c8f8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Thu, 28 Mar 2013 19:19:11 +0100 Subject: uprobes/tracing: Kill the pointless local_save_flags/preempt_count calls uprobe_trace_func() is never called with irqs or preemption disabled, no need to ask preempt_count() or local_save_flags(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8e009016e4ba..43d258db998e 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -492,17 +492,13 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) struct ring_buffer_event *event; struct ring_buffer *buffer; u8 *data; - int size, i, pc; - unsigned long irq_flags; + int size, i; struct ftrace_event_call *call = &tu->call; - local_save_flags(irq_flags); - pc = preempt_count(); - size = sizeof(*entry) + tu->size; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, irq_flags, pc); + size, 0, 0); if (!event) return 0; @@ -513,7 +509,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); if (!filter_current_check_discard(buffer, call, entry, event)) - trace_buffer_unlock_commit(buffer, event, irq_flags, pc); + trace_buffer_unlock_commit(buffer, event, 0, 0); return 0; } -- cgit v1.2.3 From 457d1772f1c1bcf37b2ae7fc8f1d6f303d1d5cf9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Fri, 29 Mar 2013 18:26:51 +0100 Subject: uprobes/tracing: Generalize struct uprobe_trace_entry_head struct uprobe_trace_entry_head has a single member for reporting, "unsigned long ip". If we want to support uretprobes we need to create another struct which has "func" and "ret_ip" and duplicate a lot of functions, like trace_kprobe.c does. To avoid this copy-and-paste horror we turn ->ip into ->vaddr[] and add couple of trivial helpers to calculate sizeof/data. This uglifies the code a bit, but this allows us to avoid a lot more complications later, when we add the support for ret-probes. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace.h | 5 ---- kernel/trace/trace_uprobe.c | 62 ++++++++++++++++++++++++++------------------- 2 files changed, 36 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2081971367ea..8bed1dfcb938 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -103,11 +103,6 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; -struct uprobe_trace_entry_head { - struct trace_entry ent; - unsigned long ip; -}; - /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 43d258db998e..49b400368bfd 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -28,6 +28,18 @@ #define UPROBE_EVENT_SYSTEM "uprobes" +struct uprobe_trace_entry_head { + struct trace_entry ent; + unsigned long vaddr[]; +}; + +#define SIZEOF_TRACE_ENTRY(is_return) \ + (sizeof(struct uprobe_trace_entry_head) + \ + sizeof(unsigned long) * (is_return ? 2 : 1)) + +#define DATAOF_TRACE_ENTRY(entry, is_return) \ + ((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return)) + struct trace_uprobe_filter { rwlock_t rwlock; int nr_systemwide; @@ -491,20 +503,19 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; struct ring_buffer *buffer; - u8 *data; + void *data; int size, i; struct ftrace_event_call *call = &tu->call; - size = sizeof(*entry) + tu->size; - + size = SIZEOF_TRACE_ENTRY(false) + tu->size; event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, 0, 0); if (!event) return 0; entry = ring_buffer_event_data(event); - entry->ip = instruction_pointer(regs); - data = (u8 *)&entry[1]; + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -518,22 +529,22 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) static enum print_line_t print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) { - struct uprobe_trace_entry_head *field; + struct uprobe_trace_entry_head *entry; struct trace_seq *s = &iter->seq; struct trace_uprobe *tu; u8 *data; int i; - field = (struct uprobe_trace_entry_head *)iter->ent; + entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, call.event); - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, field->ip)) + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, entry->vaddr[0])) goto partial; - data = (u8 *)&field[1]; + data = DATAOF_TRACE_ENTRY(entry, false); for (i = 0; i < tu->nr_args; i++) { if (!tu->args[i].type->print(s, tu->args[i].name, - data + tu->args[i].offset, field)) + data + tu->args[i].offset, entry)) goto partial; } @@ -585,16 +596,17 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag) static int uprobe_event_define_fields(struct ftrace_event_call *event_call) { - int ret, i; + int ret, i, size; struct uprobe_trace_entry_head field; - struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data; + struct trace_uprobe *tu = event_call->data; - DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); + size = SIZEOF_TRACE_ENTRY(false); /* Set argument names as fields */ for (i = 0; i < tu->nr_args; i++) { ret = trace_define_field(event_call, tu->args[i].type->fmttype, tu->args[i].name, - sizeof(field) + tu->args[i].offset, + size + tu->args[i].offset, tu->args[i].type->size, tu->args[i].type->is_signed, FILTER_OTHER); @@ -748,33 +760,31 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; - u8 *data; - int size, __size, i; - int rctx; + unsigned long ip; + void *data; + int size, rctx, i; if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; - __size = sizeof(*entry) + tu->size; - size = ALIGN(__size + sizeof(u32), sizeof(u64)); - size -= sizeof(u32); + size = SIZEOF_TRACE_ENTRY(false); + size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return 0; preempt_disable(); - entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) goto out; - entry->ip = instruction_pointer(regs); - data = (u8 *)&entry[1]; + ip = instruction_pointer(regs); + entry->vaddr[0] = ip; + data = DATAOF_TRACE_ENTRY(entry, false); for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL); - + perf_trace_buf_submit(entry, size, rctx, ip, 1, regs, head, NULL); out: preempt_enable(); return 0; @@ -784,7 +794,7 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) static int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { - struct trace_uprobe *tu = (struct trace_uprobe *)event->data; + struct trace_uprobe *tu = event->data; switch (type) { case TRACE_REG_REGISTER: -- cgit v1.2.3 From a51cc6041773dd88ff35608f54274bfd6ac68652 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sat, 30 Mar 2013 18:02:12 +0100 Subject: uprobes/tracing: Introduce uprobe_{trace,perf}_print() helpers Extract the output code from uprobe_trace_func() and uprobe_perf_func() into the new helpers, they will be used by ->ret_handler() too. We also add the unused "unsigned long func" argument in advance, to simplify the next changes. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 49b400368bfd..0c0f0a7d4ff1 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -497,8 +497,8 @@ static const struct file_operations uprobe_profile_ops = { .release = seq_release, }; -/* uprobe handler */ -static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_trace_print(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs) { struct uprobe_trace_entry_head *entry; struct ring_buffer_event *event; @@ -511,7 +511,7 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) event = trace_current_buffer_lock_reserve(&buffer, call->event.type, size, 0, 0); if (!event) - return 0; + return; entry = ring_buffer_event_data(event); entry->vaddr[0] = instruction_pointer(regs); @@ -521,7 +521,12 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!filter_current_check_discard(buffer, call, entry, event)) trace_buffer_unlock_commit(buffer, event, 0, 0); +} +/* uprobe handler */ +static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + uprobe_trace_print(tu, 0, regs); return 0; } @@ -754,8 +759,8 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc, return ret; } -/* uprobe profile handler */ -static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +static void uprobe_perf_print(struct trace_uprobe *tu, + unsigned long func, struct pt_regs *regs) { struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; @@ -764,13 +769,10 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) void *data; int size, rctx, i; - if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) - return UPROBE_HANDLER_REMOVE; - size = SIZEOF_TRACE_ENTRY(false); size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) - return 0; + return; preempt_disable(); entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); @@ -787,6 +789,15 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) perf_trace_buf_submit(entry, size, rctx, ip, 1, regs, head, NULL); out: preempt_enable(); +} + +/* uprobe profile handler */ +static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) +{ + if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) + return UPROBE_HANDLER_REMOVE; + + uprobe_perf_print(tu, 0, regs); return 0; } #endif /* CONFIG_PERF_EVENTS */ -- cgit v1.2.3 From c1ae5c75e1034070b203dc9d4ad77ce196166a6c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sat, 30 Mar 2013 18:25:23 +0100 Subject: uprobes/tracing: Introduce is_ret_probe() and uretprobe_dispatcher() Create the new functions we need to support uretprobes, and change alloc_trace_uprobe() to initialize consumer.ret_handler if the new "is_ret" argument is true. Curently this argument is always false, so the new code is never called and is_ret_probe(tu) is false too. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0c0f0a7d4ff1..72aa45e50d48 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -76,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock); static LIST_HEAD(uprobe_list); static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs); static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter) { @@ -89,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter) return !filter->nr_systemwide && list_empty(&filter->perf_events); } +static inline bool is_ret_probe(struct trace_uprobe *tu) +{ + return tu->consumer.ret_handler != NULL; +} + /* * Allocate new trace_uprobe and initialize it (including uprobes). */ static struct trace_uprobe * -alloc_trace_uprobe(const char *group, const char *event, int nargs) +alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret) { struct trace_uprobe *tu; @@ -118,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs) INIT_LIST_HEAD(&tu->list); tu->consumer.handler = uprobe_dispatcher; + if (is_ret) + tu->consumer.ret_handler = uretprobe_dispatcher; init_trace_uprobe_filter(&tu->filter); return tu; @@ -315,7 +324,7 @@ static int create_trace_uprobe(int argc, char **argv) kfree(tail); } - tu = alloc_trace_uprobe(group, event, argc); + tu = alloc_trace_uprobe(group, event, argc, false); if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); @@ -530,6 +539,12 @@ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) return 0; } +static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs) +{ + uprobe_trace_print(tu, func, regs); +} + /* Event entry printers */ static enum print_line_t print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -800,6 +815,12 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) uprobe_perf_print(tu, 0, regs); return 0; } + +static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func, + struct pt_regs *regs) +{ + uprobe_perf_print(tu, func, regs); +} #endif /* CONFIG_PERF_EVENTS */ static @@ -854,6 +875,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) return ret; } +static int uretprobe_dispatcher(struct uprobe_consumer *con, + unsigned long func, struct pt_regs *regs) +{ + struct trace_uprobe *tu; + + tu = container_of(con, struct trace_uprobe, consumer); + + if (tu->flags & TP_FLAG_TRACE) + uretprobe_trace_func(tu, func, regs); + +#ifdef CONFIG_PERF_EVENTS + if (tu->flags & TP_FLAG_PROFILE) + uretprobe_perf_func(tu, func, regs); +#endif + return 0; +} + static struct trace_event_functions uprobe_funcs = { .trace = print_uprobe_event }; -- cgit v1.2.3 From 393a736c280d555d9301fc5516d4d401544eb9db Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sat, 30 Mar 2013 18:46:22 +0100 Subject: uprobes/tracing: Make uprobe_{trace,perf}_print() uretprobe-friendly Change uprobe_trace_print() and uprobe_perf_print() to check is_ret_probe() and fill ring_buffer_event accordingly. Also change uprobe_trace_func() and uprobe_perf_func() to not _print() if is_ret_probe() is true. Note that we keep ->handler() nontrivial even for uretprobe, we need this for filtering and for other potential extensions. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 72aa45e50d48..0ed99a27d122 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -516,15 +516,22 @@ static void uprobe_trace_print(struct trace_uprobe *tu, int size, i; struct ftrace_event_call *call = &tu->call; - size = SIZEOF_TRACE_ENTRY(false) + tu->size; + size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); event = trace_current_buffer_lock_reserve(&buffer, call->event.type, - size, 0, 0); + size + tu->size, 0, 0); if (!event) return; entry = ring_buffer_event_data(event); - entry->vaddr[0] = instruction_pointer(regs); - data = DATAOF_TRACE_ENTRY(entry, false); + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = instruction_pointer(regs); + data = DATAOF_TRACE_ENTRY(entry, false); + } + for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -535,7 +542,8 @@ static void uprobe_trace_print(struct trace_uprobe *tu, /* uprobe handler */ static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) { - uprobe_trace_print(tu, 0, regs); + if (!is_ret_probe(tu)) + uprobe_trace_print(tu, 0, regs); return 0; } @@ -784,7 +792,7 @@ static void uprobe_perf_print(struct trace_uprobe *tu, void *data; int size, rctx, i; - size = SIZEOF_TRACE_ENTRY(false); + size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32); if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; @@ -795,8 +803,15 @@ static void uprobe_perf_print(struct trace_uprobe *tu, goto out; ip = instruction_pointer(regs); - entry->vaddr[0] = ip; - data = DATAOF_TRACE_ENTRY(entry, false); + if (is_ret_probe(tu)) { + entry->vaddr[0] = func; + entry->vaddr[1] = ip; + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + entry->vaddr[0] = ip; + data = DATAOF_TRACE_ENTRY(entry, false); + } + for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); @@ -812,7 +827,8 @@ static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) if (!uprobe_perf_filter(&tu->consumer, 0, current->mm)) return UPROBE_HANDLER_REMOVE; - uprobe_perf_print(tu, 0, regs); + if (!is_ret_probe(tu)) + uprobe_perf_print(tu, 0, regs); return 0; } -- cgit v1.2.3 From 4d1298e2124767b4e263498485618b2e91aee5f0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sat, 30 Mar 2013 19:23:15 +0100 Subject: uprobes/tracing: Make register_uprobe_event() paths uretprobe-friendly Change uprobe_event_define_fields(), and __set_print_fmt() to check is_ret_probe() and use the appropriate format/fields. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 0ed99a27d122..457576215b3a 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -628,8 +628,14 @@ static int uprobe_event_define_fields(struct ftrace_event_call *event_call) struct uprobe_trace_entry_head field; struct trace_uprobe *tu = event_call->data; - DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); - size = SIZEOF_TRACE_ENTRY(false); + if (is_ret_probe(tu)) { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0); + DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0); + size = SIZEOF_TRACE_ENTRY(true); + } else { + DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0); + size = SIZEOF_TRACE_ENTRY(false); + } /* Set argument names as fields */ for (i = 0; i < tu->nr_args; i++) { ret = trace_define_field(event_call, tu->args[i].type->fmttype, @@ -652,8 +658,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len) int i; int pos = 0; - fmt = "(%lx)"; - arg = "REC->" FIELD_STRING_IP; + if (is_ret_probe(tu)) { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } else { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } /* When len=0, we just calculate the needed length */ -- cgit v1.2.3 From 3ede82dd3e3deb23429f2bf44fb600f440eef84b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sat, 30 Mar 2013 19:48:09 +0100 Subject: uprobes/tracing: Make seq_printf() code uretprobe-friendly Change probes_seq_show() and print_uprobe_event() to check is_ret_probe() and print the correct data. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 457576215b3a..8c9f4894bcc0 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -435,9 +435,10 @@ static void probes_seq_stop(struct seq_file *m, void *v) static int probes_seq_show(struct seq_file *m, void *v) { struct trace_uprobe *tu = v; + char c = is_ret_probe(tu) ? 'r' : 'p'; int i; - seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name); + seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name); seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); for (i = 0; i < tu->nr_args; i++) @@ -566,10 +567,18 @@ print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *e entry = (struct uprobe_trace_entry_head *)iter->ent; tu = container_of(event, struct trace_uprobe, call.event); - if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, entry->vaddr[0])) - goto partial; + if (is_ret_probe(tu)) { + if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name, + entry->vaddr[1], entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, true); + } else { + if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name, + entry->vaddr[0])) + goto partial; + data = DATAOF_TRACE_ENTRY(entry, false); + } - data = DATAOF_TRACE_ENTRY(entry, false); for (i = 0; i < tu->nr_args; i++) { if (!tu->args[i].type->print(s, tu->args[i].name, data + tu->args[i].offset, entry)) -- cgit v1.2.3 From 4ee5a52ed6301d0afa56cc995ef2c3795f45e801 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sat, 30 Mar 2013 20:28:15 +0100 Subject: uprobes/tracing: Change create_trace_uprobe() to support uretprobes Finally change create_trace_uprobe() to check if argv[0][0] == 'r' and pass the correct "is_ret" to alloc_trace_uprobe(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> Tested-by: Anton Arapov <anton@redhat.com> --- kernel/trace/trace_uprobe.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8c9f4894bcc0..2d08bea638a4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -201,7 +201,7 @@ end: /* * Argument syntax: - * - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] + * - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS] * * - Remove uprobe: -:[GRP/]EVENT */ @@ -213,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv) char buf[MAX_EVENT_NAME_LEN]; struct path path; unsigned long offset; - bool is_delete; + bool is_delete, is_return; int i, ret; inode = NULL; ret = 0; is_delete = false; + is_return = false; event = NULL; group = NULL; /* argc must be >= 1 */ if (argv[0][0] == '-') is_delete = true; + else if (argv[0][0] == 'r') + is_return = true; else if (argv[0][0] != 'p') { - pr_info("Probe definition must be started with 'p' or '-'.\n"); + pr_info("Probe definition must be started with 'p', 'r' or '-'.\n"); return -EINVAL; } @@ -324,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv) kfree(tail); } - tu = alloc_trace_uprobe(group, event, argc, false); + tu = alloc_trace_uprobe(group, event, argc, is_return); if (IS_ERR(tu)) { pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu)); ret = PTR_ERR(tu); -- cgit v1.2.3 From 32520b2c695b23221751eb09360a6a3dd3105b52 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Wed, 10 Apr 2013 16:25:49 +0200 Subject: uprobes/tracing: Don't pass addr=ip to perf_trace_buf_submit() uprobe_perf_print() passes addr=ip to perf_trace_buf_submit() for no reason. This sets perf_sample_data->addr for PERF_SAMPLE_ADDR, we already have perf_sample_data->ip initialized if PERF_SAMPLE_IP. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> Acked-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com> --- kernel/trace/trace_uprobe.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 2d08bea638a4..37ccb72f0f3b 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -811,7 +811,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu, struct ftrace_event_call *call = &tu->call; struct uprobe_trace_entry_head *entry; struct hlist_head *head; - unsigned long ip; void *data; int size, rctx, i; @@ -825,13 +824,12 @@ static void uprobe_perf_print(struct trace_uprobe *tu, if (!entry) goto out; - ip = instruction_pointer(regs); if (is_ret_probe(tu)) { entry->vaddr[0] = func; - entry->vaddr[1] = ip; + entry->vaddr[1] = instruction_pointer(regs); data = DATAOF_TRACE_ENTRY(entry, true); } else { - entry->vaddr[0] = ip; + entry->vaddr[0] = instruction_pointer(regs); data = DATAOF_TRACE_ENTRY(entry, false); } @@ -839,7 +837,7 @@ static void uprobe_perf_print(struct trace_uprobe *tu, call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); head = this_cpu_ptr(call->perf_events); - perf_trace_buf_submit(entry, size, rctx, ip, 1, regs, head, NULL); + perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); } -- cgit v1.2.3 From 935d8aabd4331f47a89c3e1daa5779d23cf244ee Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Sun, 14 Apr 2013 10:06:31 -0700 Subject: Add file_ns_capable() helper function for open-time capability checking Nothing is using it yet, but this will allow us to delay the open-time checks to use time, without breaking the normal UNIX permission semantics where permissions are determined by the opener (and the file descriptor can then be passed to a different process, or the process can drop capabilities). Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/capability.h | 2 ++ kernel/capability.c | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'kernel') diff --git a/include/linux/capability.h b/include/linux/capability.h index 98503b792369..d9a4f7f40f32 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -35,6 +35,7 @@ struct cpu_vfs_cap_data { #define _KERNEL_CAP_T_SIZE (sizeof(kernel_cap_t)) +struct file; struct inode; struct dentry; struct user_namespace; @@ -211,6 +212,7 @@ extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); extern bool nsown_capable(int cap); extern bool inode_capable(const struct inode *inode, int cap); +extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); /* audit system wants to get cap info from files as well */ extern int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps); diff --git a/kernel/capability.c b/kernel/capability.c index 493d97259484..f6c2ce5701e1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -392,6 +392,30 @@ bool ns_capable(struct user_namespace *ns, int cap) } EXPORT_SYMBOL(ns_capable); +/** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file: The file we want to check + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. + */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ + if (WARN_ON_ONCE(!cap_valid(cap))) + return false; + + if (security_capable(file->f_cred, ns, cap) == 0) + return true; + + return false; +} +EXPORT_SYMBOL(file_ns_capable); + /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for -- cgit v1.2.3 From da1f296fd2bfd5ad3c53d72a1ece593e821cf374 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 14 Apr 2013 10:32:19 -0700 Subject: cgroup: make cgroup_path() not print double slashes While reimplementing cgroup_path(), 65dff759d2 ("cgroup: fix cgroup_path() vs rename() race") introduced a bug where the path of a non-root cgroup would have two slahses at the beginning, which is caused by treating the root cgroup which has the name '/' like non-root cgroups. $ grep systemd /proc/self/cgroup 1:name=systemd://user/root/1 Fix it by special casing root cgroup case and not looping over it in the normal path. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Li Zefan <lizefan@huawei.com> --- kernel/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 678a22c75fdb..faf55f59646b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1811,11 +1811,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) int ret = -ENAMETOOLONG; char *start; + if (!cgrp->parent) { + if (strlcpy(buf, "/", buflen) >= buflen) + return -ENAMETOOLONG; + return 0; + } + start = buf + buflen - 1; *start = '\0'; rcu_read_lock(); - while (cgrp) { + do { const char *name = cgroup_name(cgrp); int len; @@ -1824,15 +1830,12 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) goto out; memcpy(start, name, len); - if (!cgrp->parent) - break; - if (--start < buf) goto out; *start = '/'; cgrp = cgrp->parent; - } + } while (cgrp->parent); ret = 0; memmove(buf, start, buf + buflen - start); out: -- cgit v1.2.3 From 6708075f104c3c9b04b23336bb0366ca30c3931b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" <ebiederm@xmission.com> Date: Sun, 14 Apr 2013 13:47:02 -0700 Subject: userns: Don't let unprivileged users trick privileged users into setting the id_map When we require privilege for setting /proc/<pid>/uid_map or /proc/<pid>/gid_map no longer allow an unprivileged user to open the file and pass it to a privileged program to write to the file. Instead when privilege is required require both the opener and the writer to have the necessary capabilities. I have tested this code and verified that setting /proc/<pid>/uid_map fails when an unprivileged user opens the file and a privielged user attempts to set the mapping, that unprivileged users can still map their own id, and that a privileged users can still setup an arbitrary mapping. Reported-by: Andy Lutomirski <luto@amacapital.net> Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- kernel/user_namespace.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index a54f26f82eb2..e2d4ace4481b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -25,7 +25,8 @@ static struct kmem_cache *user_ns_cachep __read_mostly; -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) @@ -700,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ - if (!new_idmap_permitted(ns, cap_setid, &new_map)) + if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; /* Map the lower ids from the parent user namespace to the @@ -787,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { /* Allow mapping to your own filesystem ids */ @@ -811,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. + * And the opener of the id file also had the approprpiate capability. */ - if (ns_capable(ns->parent, cap_setid)) + if (ns_capable(ns->parent, cap_setid) && + file_ns_capable(file, ns->parent, cap_setid)) return true; return false; -- cgit v1.2.3 From e3211c120a85b792978bcb4be7b2886df18d27f0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski <luto@amacapital.net> Date: Sun, 14 Apr 2013 16:28:19 -0700 Subject: userns: Check uid_map's opener's fsuid, not the current fsuid Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- kernel/user_namespace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e2d4ace4481b..5c16f3aa757a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -797,12 +797,12 @@ static bool new_idmap_permitted(const struct file *file, u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, current_fsuid())) + if (uid_eq(uid, file->f_cred->fsuid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, current_fsgid())) + if (gid_eq(gid, file->f_cred->fsgid)) return true; } } -- cgit v1.2.3 From 41c21e351e79004dbb4efa4bc14a53a7e0af38c5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski <luto@amacapital.net> Date: Sun, 14 Apr 2013 11:44:04 -0700 Subject: userns: Changing any namespace id mappings should require privileges Changing uid/gid/projid mappings doesn't change your id within the namespace; it reconfigures the namespace. Unprivileged programs should *not* be able to write these files. (We're also checking the privileges on the wrong task.) Given the write-once nature of these files and the other security checks, this is likely impossible to usefully exploit. Signed-off-by: Andy Lutomirski <luto@amacapital.net> --- kernel/user_namespace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 5c16f3aa757a..e134d8f365dd 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -613,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (map->nr_extents != 0) goto out; - /* Require the appropriate privilege CAP_SETUID or CAP_SETGID - * over the user namespace in order to set the id mapping. + /* + * Adjusting namespace settings requires capabilities on the target. */ - if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) + if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; /* Get a buffer */ -- cgit v1.2.3 From 9343862945fdd3dd5cb7648bb24cabe40faa9ad9 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 14 Apr 2013 20:15:25 -0700 Subject: cgroup: convert cgroupfs_root flag bits to masks and add CGRP_ prefix There's no reason to be using bitops, which tends to be more cumbersome, to handle root flags. Convert them to masks. Also, as they'll be moved to include/linux/cgroup.h and it's generally a good idea, add CGRP_ prefix. Note that flags are assigned from (1 << 1). The first bit will be used by a flag which will be added soon. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com> Acked-by: Li Zefan <lizefan@huawei.com> --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index faf55f59646b..67428d84e38b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -296,10 +296,10 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) } EXPORT_SYMBOL_GPL(cgroup_is_descendant); -/* bits in struct cgroupfs_root flags field */ +/* cgroupfs_root->flags */ enum { - ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ - ROOT_XATTR, /* supports extended attributes */ + CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ + CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ }; static int cgroup_is_releasable(const struct cgroup *cgrp) @@ -1137,9 +1137,9 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); - if (test_bit(ROOT_NOPREFIX, &root->flags)) + if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); - if (test_bit(ROOT_XATTR, &root->flags)) + if (root->flags & CGRP_ROOT_XATTR) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); @@ -1202,7 +1202,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "noprefix")) { - set_bit(ROOT_NOPREFIX, &opts->flags); + opts->flags |= CGRP_ROOT_NOPREFIX; continue; } if (!strcmp(token, "clone_children")) { @@ -1210,7 +1210,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "xattr")) { - set_bit(ROOT_XATTR, &opts->flags); + opts->flags |= CGRP_ROOT_XATTR; continue; } if (!strncmp(token, "release_agent=", 14)) { @@ -1293,8 +1293,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) * with the old cpuset, so we allow noprefix only if mounting just * the cpuset subsystem. */ - if (test_bit(ROOT_NOPREFIX, &opts->flags) && - (opts->subsys_mask & mask)) + if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) return -EINVAL; @@ -2526,7 +2525,7 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry) static inline int xattr_enabled(struct dentry *dentry) { struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - return test_bit(ROOT_XATTR, &root->flags); + return root->flags & CGRP_ROOT_XATTR; } static bool is_valid_xattr(const char *name) @@ -2698,7 +2697,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, simple_xattrs_init(&cft->xattrs); - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { + if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, subsys->name); strcat(name, "."); } -- cgit v1.2.3 From 25a7e6848db76e22677aff202d9c4ef3503be15b Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 14 Apr 2013 20:15:25 -0700 Subject: move cgroupfs_root to include/linux/cgroup.h While controllers shouldn't be accessing cgroupfs_root directly, it being hidden inside kern/cgroup.c makes somethings pretty silly. This makes routing hierarchy-wide settings which need to be visible to controllers cumbersome. We're gonna add another hierarchy-wide setting which needs to be accessed from controllers. Move cgroupfs_root and its flags to the header file so that we can access root settings with inline helpers. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com> Acked-by: Li Zefan <lizefan@huawei.com> --- include/linux/cgroup.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup.c | 57 -------------------------------------------------- 2 files changed, 57 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 45aee0fc6b98..b21881e1ea08 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -19,6 +19,7 @@ #include <linux/idr.h> #include <linux/workqueue.h> #include <linux/xattr.h> +#include <linux/fs.h> #ifdef CONFIG_CGROUPS @@ -238,6 +239,62 @@ struct cgroup { struct simple_xattrs xattrs; }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + +/* cgroupfs_root->flags */ +enum { + CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ + CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ +}; + +/* + * A cgroupfs_root represents the root of a cgroup hierarchy, and may be + * associated with a superblock to form an active hierarchy. This is + * internal to cgroup core. Don't access directly from controllers. + */ +struct cgroupfs_root { + struct super_block *sb; + + /* + * The bitmask of subsystems intended to be attached to this + * hierarchy + */ + unsigned long subsys_mask; + + /* Unique id for this hierarchy. */ + int hierarchy_id; + + /* The bitmask of subsystems currently attached to this hierarchy */ + unsigned long actual_subsys_mask; + + /* A list running through the attached subsystems */ + struct list_head subsys_list; + + /* The root cgroup for this hierarchy */ + struct cgroup top_cgroup; + + /* Tracks how many cgroups are currently defined in hierarchy.*/ + int number_of_cgroups; + + /* A list running through the active hierarchies */ + struct list_head root_list; + + /* All cgroups on this root, cgroup_mutex protected */ + struct list_head allcg_list; + + /* Hierarchy-specific flags */ + unsigned long flags; + + /* IDs for cgroups in this hierarchy */ + struct ida cgroup_ida; + + /* The path to use for release notifications. */ + char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; +}; + /* * A css_set is a structure holding pointers to a set of * cgroup_subsys_state objects. This saves space in the task struct diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 67428d84e38b..8b8eb7c168ff 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -30,7 +30,6 @@ #include <linux/cred.h> #include <linux/ctype.h> #include <linux/errno.h> -#include <linux/fs.h> #include <linux/init_task.h> #include <linux/kernel.h> #include <linux/list.h> @@ -104,56 +103,6 @@ static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include <linux/cgroup_subsys.h> }; -#define MAX_CGROUP_ROOT_NAMELEN 64 - -/* - * A cgroupfs_root represents the root of a cgroup hierarchy, - * and may be associated with a superblock to form an active - * hierarchy - */ -struct cgroupfs_root { - struct super_block *sb; - - /* - * The bitmask of subsystems intended to be attached to this - * hierarchy - */ - unsigned long subsys_mask; - - /* Unique id for this hierarchy. */ - int hierarchy_id; - - /* The bitmask of subsystems currently attached to this hierarchy */ - unsigned long actual_subsys_mask; - - /* A list running through the attached subsystems */ - struct list_head subsys_list; - - /* The root cgroup for this hierarchy */ - struct cgroup top_cgroup; - - /* Tracks how many cgroups are currently defined in hierarchy.*/ - int number_of_cgroups; - - /* A list running through the active hierarchies */ - struct list_head root_list; - - /* All cgroups on this root, cgroup_mutex protected */ - struct list_head allcg_list; - - /* Hierarchy-specific flags */ - unsigned long flags; - - /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; - - /* The path to use for release notifications. */ - char release_agent_path[PATH_MAX]; - - /* The name for this hierarchy - may be empty */ - char name[MAX_CGROUP_ROOT_NAMELEN]; -}; - /* * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the * subsystems that are otherwise unattached - it never has more than a @@ -296,12 +245,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) } EXPORT_SYMBOL_GPL(cgroup_is_descendant); -/* cgroupfs_root->flags */ -enum { - CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ - CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ -}; - static int cgroup_is_releasable(const struct cgroup *cgrp) { const int bits = -- cgit v1.2.3 From 873fe09ea5df6ccf6bb34811d8c9992aacb67598 Mon Sep 17 00:00:00 2001 From: Tejun Heo <tj@kernel.org> Date: Sun, 14 Apr 2013 20:15:26 -0700 Subject: cgroup: introduce sane_behavior mount option It's a sad fact that at this point various cgroup controllers are carrying so many idiosyncrasies and pure insanities that it simply isn't possible to reach any sort of sane consistent behavior while maintaining staying fully compatible with what already has been exposed to userland. As we can't break exposed userland interface, transitioning to sane behaviors can only be done in steps while maintaining backwards compatibility. This patch introduces a new mount option - __DEVEL__sane_behavior - which disables crazy features and enforces consistent behaviors in cgroup core proper and various controllers. As exactly which behaviors it changes are still being determined, the mount option, at this point, is useful only for development of the new behaviors. As such, the mount option is prefixed with __DEVEL__ and generates a warning message when used. Eventually, once we get to the point where all controller's behaviors are consistent enough to implement unified hierarchy, the __DEVEL__ prefix will be dropped, and more importantly, unified-hierarchy will enforce sane_behavior by default. Maybe we'll able to completely drop the crazy stuff after a while, maybe not, but we at least have a strategy to move on to saner behaviors. This patch introduces the mount option and changes the following behaviors in cgroup core. * Mount options "noprefix" and "clone_children" are disallowed. Also, cgroupfs file cgroup.clone_children is not created. * When mounting an existing superblock, mount options should match. This is currently pretty crazy. If one mounts a cgroup, creates a subdirectory, unmounts it and then mount it again with different option, it looks like the new options are applied but they aren't. * Remount is disallowed. The behaviors changes are documented in the comment above CGRP_ROOT_SANE_BEHAVIOR enum and will be expanded as different controllers are converted and planned improvements progress. v2: Dropped unnecessary explicit file permission setting sane_behavior cftype entry as suggested by Li Zefan. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Serge E. Hallyn <serge.hallyn@ubuntu.com> Acked-by: Li Zefan <lizefan@huawei.com> Cc: Michal Hocko <mhocko@suse.cz> Cc: Vivek Goyal <vgoyal@redhat.com> --- include/linux/cgroup.h | 43 +++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b21881e1ea08..9c300ad9a911 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -156,6 +156,8 @@ enum { * specified at mount time and thus is implemented here. */ CGRP_CPUSET_CLONE_CHILDREN, + /* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */ + CGRP_SANE_BEHAVIOR, }; struct cgroup_name { @@ -243,6 +245,37 @@ struct cgroup { /* cgroupfs_root->flags */ enum { + /* + * Unfortunately, cgroup core and various controllers are riddled + * with idiosyncrasies and pointless options. The following flag, + * when set, will force sane behavior - some options are forced on, + * others are disallowed, and some controllers will change their + * hierarchical or other behaviors. + * + * The set of behaviors affected by this flag are still being + * determined and developed and the mount option for this flag is + * prefixed with __DEVEL__. The prefix will be dropped once we + * reach the point where all behaviors are compatible with the + * planned unified hierarchy, which will automatically turn on this + * flag. + * + * The followings are the behaviors currently affected this flag. + * + * - Mount options "noprefix" and "clone_children" are disallowed. + * Also, cgroupfs file cgroup.clone_children is not created. + * + * - When mounting an existing superblock, mount options should + * match. + * + * - Remount is disallowed. + * + * The followings are planned changes. + * + * - release_agent will be disallowed once replacement notification + * mechanism is implemented. + */ + CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), + CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */ CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */ }; @@ -360,6 +393,7 @@ struct cgroup_map_cb { /* cftype->flags */ #define CFTYPE_ONLY_ON_ROOT (1U << 0) /* only create on root cg */ #define CFTYPE_NOT_ON_ROOT (1U << 1) /* don't create on root cg */ +#define CFTYPE_INSANE (1U << 2) /* don't create if sane_behavior */ #define MAX_CFTYPE_NAME 64 @@ -486,6 +520,15 @@ struct cgroup_scanner { void *data; }; +/* + * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This + * function can be called as long as @cgrp is accessible. + */ +static inline bool cgroup_sane_behavior(const struct cgroup *cgrp) +{ + return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR; +} + /* Caller should hold rcu_read_lock() */ static inline const char *cgroup_name(const struct cgroup *cgrp) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8b8eb7c168ff..67804590d4b0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1080,6 +1080,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) mutex_lock(&cgroup_root_mutex); for_each_subsys(root, ss) seq_printf(seq, ",%s", ss->name); + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) + seq_puts(seq, ",sane_behavior"); if (root->flags & CGRP_ROOT_NOPREFIX) seq_puts(seq, ",noprefix"); if (root->flags & CGRP_ROOT_XATTR) @@ -1144,6 +1146,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } + if (!strcmp(token, "__DEVEL__sane_behavior")) { + opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; + continue; + } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1231,6 +1237,20 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) /* Consistency checks */ + if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_warning("cgroup: sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); + + if (opts->flags & CGRP_ROOT_NOPREFIX) { + pr_err("cgroup: sane_behavior: noprefix is not allowed\n"); + return -EINVAL; + } + + if (opts->cpuset_clone_children) { + pr_err("cgroup: sane_behavior: clone_children is not allowed\n"); + return -EINVAL; + } + } + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -1307,6 +1327,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) struct cgroup_sb_opts opts; unsigned long added_mask, removed_mask; + if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { + pr_err("cgroup: sane_behavior: remount is not allowed\n"); + return -EINVAL; + } + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1657,6 +1682,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + + if (((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) && + root->flags != opts.flags) { + pr_err("cgroup: sane_behavior: new mount options should match the existing superblock\n"); + ret = -EINVAL; + goto drop_new_super; + } + /* no subsys rebinding, so refcounts don't change */ drop_parsed_module_refcounts(opts.subsys_mask); } @@ -2200,6 +2233,13 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return 0; } +static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + return 0; +} + /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2681,6 +2721,8 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) + continue; if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) continue; if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) @@ -3918,9 +3960,15 @@ static struct cftype files[] = { }, { .name = "cgroup.clone_children", + .flags = CFTYPE_INSANE, .read_u64 = cgroup_clone_children_read, .write_u64 = cgroup_clone_children_write, }, + { + .name = "cgroup.sane_behavior", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = cgroup_sane_behavior_show, + }, { .name = "release_agent", .flags = CFTYPE_ONLY_ON_ROOT, -- cgit v1.2.3 From 05fb22ec5456a472a5eadcaacb3e51eca1f8c79c Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Mon, 15 Apr 2013 14:25:05 +0800 Subject: cgroup: remove cgrp->top_cgroup It's not used, and it can be retrieved via cgrp->root->top_cgroup. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/cgroup.h | 1 - kernel/cgroup.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9c300ad9a911..64047ae7fde1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -204,7 +204,6 @@ struct cgroup { struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; struct cgroupfs_root *root; - struct cgroup *top_cgroup; /* * List of cg_cgroup_links pointing at css_sets with diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 67804590d4b0..c16719e056ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1418,7 +1418,6 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->name = &root_cgroup_name; - cgrp->top_cgroup = cgrp; init_cgroup_housekeeping(cgrp); list_add_tail(&cgrp->allcg_node, &root->allcg_list); } @@ -4145,7 +4144,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->parent = parent; cgrp->root = parent->root; - cgrp->top_cgroup = parent->top_cgroup; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); -- cgit v1.2.3 From 8176cced706b5e5d15887584150764894e94e02f Mon Sep 17 00:00:00 2001 From: Tommi Rantala <tt.rantala@gmail.com> Date: Sat, 13 Apr 2013 22:49:14 +0300 Subject: perf: Treat attr.config as u64 in perf_swevent_init() Trinity discovered that we fail to check all 64 bits of attr.config passed by user space, resulting to out-of-bounds access of the perf_swevent_enabled array in sw_perf_event_destroy(). Introduced in commit b0a873ebb ("perf: Register PMU implementations"). Signed-off-by: Tommi Rantala <tt.rantala@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: davej@redhat.com Cc: Paul Mackerras <paulus@samba.org> Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net> Link: http://lkml.kernel.org/r/1365882554-30259-1-git-send-email-tt.rantala@gmail.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7e0962ed7f8a..4d3124b39277 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5331,7 +5331,7 @@ static void sw_perf_event_destroy(struct perf_event *event) static int perf_swevent_init(struct perf_event *event) { - int event_id = event->attr.config; + u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; -- cgit v1.2.3 From 515619f209114697fabd21eed1623bfa69746815 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Sat, 13 Apr 2013 15:36:49 +0200 Subject: uprobes/perf: Avoid perf_trace_buf_prepare/submit if ->perf_events is empty perf_trace_buf_prepare() + perf_trace_buf_submit() make no sense if this task/CPU has no active counters. Change uprobe_perf_print() to return if hlist_empty(call->perf_events). Note: this is not uprobe-specific, we can change other users too. Signed-off-by: Oleg Nesterov <oleg@redhat.com> --- kernel/trace/trace_uprobe.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 37ccb72f0f3b..32494fb0ee64 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -820,6 +820,10 @@ static void uprobe_perf_print(struct trace_uprobe *tu, return; preempt_disable(); + head = this_cpu_ptr(call->perf_events); + if (hlist_empty(head)) + goto out; + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) goto out; @@ -836,7 +840,6 @@ static void uprobe_perf_print(struct trace_uprobe *tu, for (i = 0; i < tu->nr_args; i++) call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); - head = this_cpu_ptr(call->perf_events); perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL); out: preempt_enable(); -- cgit v1.2.3 From d190e8195b90bc1e65c494fe08e54e9e581bfd16 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Wed, 17 Apr 2013 10:33:13 +0200 Subject: idle: Remove GENERIC_IDLE_LOOP config switch All archs are converted over. Remove the config switch and the fallback code. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- arch/Kconfig | 3 --- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/avr32/Kconfig | 1 - arch/blackfin/Kconfig | 1 - arch/c6x/Kconfig | 1 - arch/cris/Kconfig | 1 - arch/frv/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m32r/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/metag/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/mn10300/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/score/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/tile/Kconfig | 1 - arch/um/Kconfig.common | 1 - arch/unicore32/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - kernel/cpu/idle.c | 8 -------- 32 files changed, 41 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index a699f3767be4..1455579791ec 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -216,9 +216,6 @@ config USE_GENERIC_SMP_HELPERS config GENERIC_SMP_IDLE_THREAD bool -config GENERIC_IDLE_LOOP - bool - # Select if arch init_task initializer is different to init/init_task.c config ARCH_INIT_TASK bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 60469820a6c5..8a33ba01301f 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -17,7 +17,6 @@ config ALPHA select ARCH_WANT_IPC_PARSE_VERSION select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select GENERIC_CMOS_UPDATE select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index b006977a1516..e6f4eca09ee3 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -14,7 +14,6 @@ config ARC select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS select GENERIC_FIND_FIRST_BIT - select GENERIC_IDLE_LOOP # for now, we don't need GENERIC_IRQ_PROBE, CONFIG_GENERIC_IRQ_CHIP select GENERIC_IRQ_SHOW select GENERIC_KERNEL_EXECVE diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 128551fcc6dd..fcedd612c54c 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -15,7 +15,6 @@ config ARM select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select GENERIC_IDLE_POLL_SETUP select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ca2c871795c5..9b6d19f74078 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -9,7 +9,6 @@ config ARM64 select CLONE_BACKWARDS select COMMON_CLK select GENERIC_CLOCKEVENTS - select GENERIC_IDLE_LOOP select GENERIC_IOMAP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index bbecda4c3373..c1a868d398bd 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -10,7 +10,6 @@ config AVR32 select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_ATOMIC64 - select GENERIC_IDLE_LOOP select HARDIRQS_SW_RESEND select GENERIC_IRQ_SHOW select ARCH_HAVE_CUSTOM_GPIO_H diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 3d769a7c4941..c3f2e0bc644a 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -41,7 +41,6 @@ config BLACKFIN select USE_GENERIC_SMP_HELPERS if SMP select HAVE_NMI_WATCHDOG if NMI_WATCHDOG select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select ARCH_USES_GETTIMEOFFSET if !GENERIC_CLOCKEVENTS select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_RELA diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index af2aa4b44140..f6a3648f5ec3 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -18,7 +18,6 @@ config C6X select OF_EARLY_FLATTREE select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_IDLE_LOOP config MMU def_bool n diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index 1dd36355a3cd..06dd026533e3 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -48,7 +48,6 @@ config CRIS select GENERIC_IRQ_SHOW select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD if ETRAX_ARCH_V32 - select GENERIC_IDLE_LOOP select GENERIC_CMOS_UPDATE select MODULES_USE_ELF_RELA select CLONE_BACKWARDS2 diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 0d998db2039e..2ce731f9aa4d 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -8,7 +8,6 @@ config FRV select HAVE_GENERIC_HARDIRQS select VIRT_TO_BUS select GENERIC_IRQ_SHOW - select GENERIC_IDLE_LOOP select HAVE_DEBUG_BUGVERBOSE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CPU_DEVICES diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 5374975fe800..79250de1b12a 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -8,7 +8,6 @@ config H8300 select VIRT_TO_BUS select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_IRQ_SHOW - select GENERIC_IDLE_LOOP select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 6e3710e84a6c..e4decc6b8947 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -24,7 +24,6 @@ config HEXAGON select NO_IOPORT select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select STACKTRACE_SUPPORT select KTIME_SCALAR select GENERIC_CLOCKEVENTS diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index e0b39c3cb789..9a02f71c6b1f 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -35,7 +35,6 @@ config IA64 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select ARCH_INIT_TASK select ARCH_TASK_STRUCT_ALLOCATOR select ARCH_THREAD_INFO_ALLOCATOR diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index af814fe478ae..bcd17b206571 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -13,7 +13,6 @@ config M32R select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW - select GENERIC_IDLE_LOOP select GENERIC_ATOMIC64 select ARCH_USES_GETTIMEOFFSET select MODULES_USE_ELF_RELA diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 41859405a853..6de813370b8c 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -6,7 +6,6 @@ config M68K select HAVE_DEBUG_BUGVERBOSE select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW - select GENERIC_IDLE_LOOP select GENERIC_ATOMIC64 select HAVE_UID16 select VIRT_TO_BUS diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index 3164f6cebe2b..afc8973d1488 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -7,7 +7,6 @@ config METAG select EMBEDDED select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS - select GENERIC_IDLE_LOOP select GENERIC_IRQ_SHOW select GENERIC_SMP_IDLE_THREAD select HAVE_64BIT_ALIGNED_ACCESS diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 0bb0d519a233..a827057c7927 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -26,7 +26,6 @@ config MICROBLAZE select GENERIC_CPU_DEVICES select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS - select GENERIC_IDLE_LOOP select GENERIC_IDLE_POLL_SETUP select MODULES_USE_ELF_RELA select CLONE_BACKWARDS diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index e1a3d02af637..51244bf97271 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -34,7 +34,6 @@ config MIPS select HAVE_MEMBLOCK_NODE_MAP select ARCH_DISCARD_MEMBLOCK select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select BUILDTIME_EXTABLE_SORT select GENERIC_CLOCKEVENTS select GENERIC_CMOS_UPDATE diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index ae7158b69c9c..428da175d073 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -13,7 +13,6 @@ config MN10300 select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - select GENERIC_IDLE_LOOP config AM33_2 def_bool n diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index e111b5200cd9..9ab3bf2eca8d 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -21,7 +21,6 @@ config OPENRISC select GENERIC_CLOCKEVENTS select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_IDLE_LOOP select MODULES_USE_ELF_RELA config MMU diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 0821e702d03f..0339181bf3ac 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -18,7 +18,6 @@ config PARISC select GENERIC_PCI_IOMAP select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select GENERIC_STRNCPY_FROM_USER select SYSCTL_ARCH_UNALIGN_ALLOW select HAVE_MOD_ARCH_SPECIFIC diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 19de32c52da5..ea5bb045983a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -133,7 +133,6 @@ config PPC select HAVE_ARCH_JUMP_LABEL select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select GENERIC_CMOS_UPDATE select GENERIC_TIME_VSYSCALL_OLD select GENERIC_CLOCKEVENTS diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 749513d73e47..eb8fb629f00b 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -97,7 +97,6 @@ config S390 select CLONE_BACKWARDS2 select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES if !SMP - select GENERIC_IDLE_LOOP select GENERIC_KERNEL_THREAD select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL_OLD diff --git a/arch/score/Kconfig b/arch/score/Kconfig index eaac45de65be..c8def8bc9020 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -11,7 +11,6 @@ config SCORE select ARCH_DISCARD_MEMBLOCK select GENERIC_CPU_DEVICES select GENERIC_CLOCKEVENTS - select GENERIC_IDLE_LOOP select HAVE_MOD_ARCH_SPECIFIC select VIRT_TO_BUS select MODULES_USE_ELF_REL diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 0917fc3f39c0..1ea597c6497a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -33,7 +33,6 @@ config SUPERH select GENERIC_ATOMIC64 select GENERIC_IRQ_SHOW select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select GENERIC_IDLE_POLL_SETUP select GENERIC_CLOCKEVENTS select GENERIC_CMOS_UPDATE if SH_SH03 || SH_DREAMCAST diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index ee5eacc5a649..3d361f236308 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -37,7 +37,6 @@ config SPARC select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE select GENERIC_CLOCKEVENTS - select GENERIC_IDLE_LOOP select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 85b2edeade31..25877aebc685 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -22,7 +22,6 @@ config TILE select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA - select GENERIC_IDLE_LOOP # FIXME: investigate whether we need/want these options. # select HAVE_IOREMAP_PROT diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index 57834067a511..bceee6623b00 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -12,7 +12,6 @@ config UML select GENERIC_CPU_DEVICES select GENERIC_IO select GENERIC_CLOCKEVENTS - select GENERIC_IDLE_LOOP select TTY # Needed for line.c config MMU diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index e12678daaed1..2943e3acdf0c 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -17,7 +17,6 @@ config UNICORE32 select ARCH_WANT_FRAME_POINTERS select GENERIC_IOMAP select MODULES_USE_ELF_REL - select GENERIC_IDLE_LOOP help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 734152d85fd0..fcf293994992 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -97,7 +97,6 @@ config X86 select GENERIC_IOMAP select DCACHE_WORD_ACCESS select GENERIC_SMP_IDLE_THREAD - select GENERIC_IDLE_LOOP select ARCH_WANT_IPC_PARSE_VERSION if X86_32 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index e0144ff4624d..b09de49dbec5 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -19,7 +19,6 @@ config XTENSA select CLONE_BACKWARDS select IRQ_DOMAIN select HAVE_OPROFILE - select GENERIC_IDLE_LOOP help Xtensa processors are 32-bit RISC machines designed by Tensilica primarily for embedded systems. These processors are both diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index 54c320383934..168cf407a254 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -10,13 +10,6 @@ #include <trace/events/power.h> -#ifndef CONFIG_GENERIC_IDLE_LOOP -void cpu_startup_entry(enum cpuhp_state state) -{ - cpu_idle(); -} -#else - static int __read_mostly cpu_idle_force_poll; void cpu_idle_poll_ctrl(bool enable) @@ -112,4 +105,3 @@ void cpu_startup_entry(enum cpuhp_state state) arch_cpu_idle_prepare(); cpu_idle_loop(); } -#endif -- cgit v1.2.3 From 55a20ee7804ab64ac90bcdd4e2868a42829e2784 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Mon, 15 Apr 2013 22:23:47 -0700 Subject: x86, kdump: Retore crashkernel= to allocate under 896M Vivek found old kexec-tools does not work new kernel anymore. So change back crashkernel= back to old behavoir, and add crashkernel_high= to let user decide if buffer could be above 4G, and also new kexec-tools will be needed. -v2: let crashkernel=X override crashkernel_high= update description about _high will be ignored by crashkernel=X -v3: update description about kernel-parameters.txt according to Vivek. Signed-off-by: Yinghai Lu <yinghai@kernel.org> Link: http://lkml.kernel.org/r/1366089828-19692-4-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> --- Documentation/kernel-parameters.txt | 13 +++++++++++-- arch/x86/kernel/setup.c | 24 +++++++++++++++++++----- include/linux/kexec.h | 2 ++ kernel/kexec.c | 9 +++++++++ 4 files changed, 41 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cff672da2486..709eb3edc6b2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -603,9 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. + crashkernel_high=size[KMG] + [KNL, x86_64] range could be above 4G. Allow kernel + to allocate physical memory region from top, so could + be above 4G if system have more than 4G ram installed. + Otherwise memory region will be allocated below 4G, if + available. + It will be ignored if crashkernel=X is specified. crashkernel_low=size[KMG] - [KNL, x86_64] range under 4G. When crashkernel= is - passed, kernel allocate physical memory region + [KNL, x86_64] range under 4G. When crashkernel_high= is + passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. swiotlb requires at least 64M+32K low memory. Kernel would @@ -613,6 +620,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. This one let user to specify own low range under 4G for second kernel instead. 0: to disable low allocation. + It will be ignored when crashkernel_high=X is not used + or memory reserved is below 4G. cs89x0_dma= [HW,NET] Format: <dma> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 12349202cae7..a85a144f2052 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -507,11 +507,14 @@ static void __init memblock_x86_reserve_range_setup_data(void) /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. + * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_KERNEL_ADDR_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) +# define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) #else -# define CRASH_KERNEL_ADDR_MAX MAXMEM +# define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) +# define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM #endif static void __init reserve_crashkernel_low(void) @@ -525,6 +528,7 @@ static void __init reserve_crashkernel_low(void) int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); + /* crashkernel_low=YM */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret != 0) { @@ -569,14 +573,22 @@ static void __init reserve_crashkernel(void) const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long total_mem; unsigned long long crash_size, crash_base; + bool high = false; int ret; total_mem = memblock_phys_mem_size(); + /* crashkernel=XM */ ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); - if (ret != 0 || crash_size <= 0) - return; + if (ret != 0 || crash_size <= 0) { + /* crashkernel_high=XM */ + ret = parse_crashkernel_high(boot_command_line, total_mem, + &crash_size, &crash_base); + if (ret != 0 || crash_size <= 0) + return; + high = true; + } /* 0 means: find the address automatically */ if (crash_base <= 0) { @@ -584,7 +596,9 @@ static void __init reserve_crashkernel(void) * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX */ crash_base = memblock_find_in_range(alignment, - CRASH_KERNEL_ADDR_MAX, crash_size, alignment); + high ? CRASH_KERNEL_ADDR_HIGH_MAX : + CRASH_KERNEL_ADDR_LOW_MAX, + crash_size, alignment); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d2e6927bbaae..d78d28a733b1 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -200,6 +200,8 @@ extern size_t vmcoreinfo_max_size; int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b6..1b2f73f5f9b9 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1422,6 +1422,15 @@ int __init parse_crashkernel(char *cmdline, "crashkernel="); } +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel_high="); +} + int __init parse_crashkernel_low(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, -- cgit v1.2.3 From adbc742bf78695bb98c79d18c558b61571748b99 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Mon, 15 Apr 2013 22:23:48 -0700 Subject: x86, kdump: Change crashkernel_high/low= to crashkernel=,high/low Per hpa, use crashkernel=X,high crashkernel=Y,low instead of crashkernel_hign=X crashkernel_low=Y. As that could be extensible. -v2: according to Vivek, change delimiter to ; -v3: let hign and low only handle simple form and it conforms to description in kernel-parameters.txt still keep crashkernel=X override any crashkernel=X,high crashkernel=Y,low -v4: update get_last_crashkernel returning and add more strict checking in parse_crashkernel_simple() found by HATAYAMA. -v5: Change delimiter back to , according to HPA. also separate parse_suffix from parse_simper according to vivek. so we can avoid @pos in that path. -v6: Tight the checking about crashkernel=X,highblahblah,high found by HTYAYAMA. Cc: HATAYAMA Daisuke <d.hatayama@jp.fujitsu.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Link: http://lkml.kernel.org/r/1366089828-19692-5-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> --- Documentation/kernel-parameters.txt | 10 ++-- arch/x86/kernel/setup.c | 6 +- kernel/kexec.c | 109 +++++++++++++++++++++++++++++++----- 3 files changed, 104 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 709eb3edc6b2..a1ac1f1d6d34 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -603,16 +603,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted. a memory unit (amount[KMG]). See also Documentation/kdump/kdump.txt for an example. - crashkernel_high=size[KMG] + crashkernel=size[KMG],high [KNL, x86_64] range could be above 4G. Allow kernel to allocate physical memory region from top, so could be above 4G if system have more than 4G ram installed. Otherwise memory region will be allocated below 4G, if available. It will be ignored if crashkernel=X is specified. - crashkernel_low=size[KMG] - [KNL, x86_64] range under 4G. When crashkernel_high= is - passed, kernel could allocate physical memory region + crashkernel=size[KMG],low + [KNL, x86_64] range under 4G. When crashkernel=X,high + is passed, kernel could allocate physical memory region above 4G, that cause second kernel crash on system that require some amount of low memory, e.g. swiotlb requires at least 64M+32K low memory. Kernel would @@ -620,7 +620,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. This one let user to specify own low range under 4G for second kernel instead. 0: to disable low allocation. - It will be ignored when crashkernel_high=X is not used + It will be ignored when crashkernel=X,high is not used or memory reserved is below 4G. cs89x0_dma= [HW,NET] diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a85a144f2052..fae9134a2de9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -528,7 +528,7 @@ static void __init reserve_crashkernel_low(void) int ret; total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); - /* crashkernel_low=YM */ + /* crashkernel=Y,low */ ret = parse_crashkernel_low(boot_command_line, total_low_mem, &low_size, &base); if (ret != 0) { @@ -542,7 +542,7 @@ static void __init reserve_crashkernel_low(void) low_size = swiotlb_size_or_default() + (8UL<<20); auto_set = true; } else { - /* passed with crashkernel_low=0 ? */ + /* passed with crashkernel=0,low ? */ if (!low_size) return; } @@ -582,7 +582,7 @@ static void __init reserve_crashkernel(void) ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) { - /* crashkernel_high=XM */ + /* crashkernel=X,high */ ret = parse_crashkernel_high(boot_command_line, total_mem, &crash_size, &crash_base); if (ret != 0 || crash_size <= 0) diff --git a/kernel/kexec.c b/kernel/kexec.c index 1b2f73f5f9b9..401fdb041f35 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1368,35 +1368,114 @@ static int __init parse_crashkernel_simple(char *cmdline, return 0; } +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + /* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - const char *name) + const char *name, + const char *suffix) { - char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; + char *ck_cmdline; BUG_ON(!crash_size || !crash_base); *crash_size = 0; *crash_base = 0; - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - ck_cmdline = p; - p = strstr(p+1, name); - } + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); if (!ck_cmdline) return -EINVAL; ck_cmdline += strlen(name); + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + crash_base, suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax @@ -1413,13 +1492,17 @@ static int __init __parse_crashkernel(char *cmdline, return 0; } +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel="); + "crashkernel=", NULL); } int __init parse_crashkernel_high(char *cmdline, @@ -1428,7 +1511,7 @@ int __init parse_crashkernel_high(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel_high="); + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); } int __init parse_crashkernel_low(char *cmdline, @@ -1437,7 +1520,7 @@ int __init parse_crashkernel_low(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel_low="); + "crashkernel=", suffix_tbl[SUFFIX_LOW]); } static void update_vmcoreinfo_note(void) -- cgit v1.2.3 From 157752d84f5df47e01577970f9c5f61a0b9f4546 Mon Sep 17 00:00:00 2001 From: Yinghai Lu <yinghai@kernel.org> Date: Mon, 15 Apr 2013 22:23:46 -0700 Subject: kexec: use Crash kernel for Crash kernel low We can extend kexec-tools to support multiple "Crash kernel" in /proc/iomem instead. So we can use "Crash kernel" instead of "Crash kernel low" in /proc/iomem. Suggested-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Yinghai Lu <yinghai@kernel.org> Link: http://lkml.kernel.org/r/1366089828-19692-3-git-send-email-yinghai@kernel.org Acked-by: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: H. Peter Anvin <hpa@linux.intel.com> --- kernel/kexec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 401fdb041f35..ffd4e111fd67 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -55,7 +55,7 @@ struct resource crashk_res = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; struct resource crashk_low_res = { - .name = "Crash kernel low", + .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM -- cgit v1.2.3 From b9e146d8eb3b9ecae5086d373b50fa0c1f3e7f0f Mon Sep 17 00:00:00 2001 From: Emese Revfy <re.emese@gmail.com> Date: Wed, 17 Apr 2013 15:58:36 -0700 Subject: kernel/signal.c: stop info leak via the tkill and the tgkill syscalls This fixes a kernel memory contents leak via the tkill and tgkill syscalls for compat processes. This is visible in the siginfo_t->_sifields._rt.si_sigval.sival_ptr field when handling signals delivered from tkill. The place of the infoleak: int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from) { ... put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); ... } Signed-off-by: Emese Revfy <re.emese@gmail.com> Reviewed-by: PaX Team <pageexec@freemail.hu> Signed-off-by: Kees Cook <keescook@chromium.org> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Oleg Nesterov <oleg@redhat.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Serge Hallyn <serge.hallyn@canonical.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index dd72567767d9..598dc06be421 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2948,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info; + struct siginfo info = {}; info.si_signo = sig; info.si_errno = 0; -- cgit v1.2.3 From 5c51543b0ae45967cfa99ca16748dc72888cc265 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> Date: Thu, 18 Apr 2013 18:33:18 +0900 Subject: kprobes: Fix a double lock bug of kprobe_mutex Fix a double locking bug caused when debug.kprobe-optimization=0. While the proc_kprobes_optimization_handler locks kprobe_mutex, wait_for_kprobe_optimizer locks it again and that causes a double lock. To fix the bug, this introduces different mutex for protecting sysctl parameter and locks it in proc_kprobes_optimization_handler. Of course, since we need to lock kprobe_mutex when touching kprobes resources, that is done in *optimize_all_kprobes(). This bug was introduced by commit ad72b3bea744 ("kprobes: fix wait_for_kprobe_optimizer()") Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com> Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Tejun Heo <tj@kernel.org> Cc: "David S. Miller" <davem@davemloft.net> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/kprobes.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e35be53f6613..3fed7f0cbcdf 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -794,16 +794,16 @@ out: } #ifdef CONFIG_SYSCTL -/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already allowed, just return */ if (kprobes_allow_optimization) - return; + goto out; kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void) optimize_kprobe(p); } printk(KERN_INFO "Kprobes globally optimized\n"); +out: + mutex_unlock(&kprobe_mutex); } -/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already prohibited, just return */ - if (!kprobes_allow_optimization) + if (!kprobes_allow_optimization) { + mutex_unlock(&kprobe_mutex); return; + } kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + mutex_unlock(&kprobe_mutex); + /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); printk(KERN_INFO "Kprobes globally unoptimized\n"); } +static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, @@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, { int ret; - mutex_lock(&kprobe_mutex); + mutex_lock(&kprobe_sysctl_mutex); sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, optimize_all_kprobes(); else unoptimize_all_kprobes(); - mutex_unlock(&kprobe_mutex); + mutex_unlock(&kprobe_sysctl_mutex); return ret; } -- cgit v1.2.3 From 0a82a8d132b26d438eb90b3ab35a7016e7227a1d Mon Sep 17 00:00:00 2001 From: Linus Torvalds <torvalds@linux-foundation.org> Date: Thu, 18 Apr 2013 09:00:26 -0700 Subject: Revert "block: add missing block_bio_complete() tracepoint" This reverts commit 3a366e614d0837d9fc23f78cdb1a1186ebc3387f. Wanlong Gao reports that it causes a kernel panic on his machine several minutes after boot. Reverting it removes the panic. Jens says: "It's not quite clear why that is yet, so I think we should just revert the commit for 3.9 final (which I'm assuming is pretty close). The wifi is crap at the LSF hotel, so sending this email instead of queueing up a revert and pull request." Reported-by: Wanlong Gao <gaowanlong@cn.fujitsu.com> Requested-by: Jens Axboe <axboe@kernel.dk> Cc: Tejun Heo <tj@kernel.org> Cc: Steven Rostedt <rostedt@goodmis.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- block/blk-core.c | 1 + drivers/md/dm.c | 1 + drivers/md/raid5.c | 11 ++++++++++- fs/bio.c | 2 -- include/linux/blktrace_api.h | 1 - include/trace/events/block.h | 8 ++++---- kernel/trace/blktrace.c | 26 +++----------------------- 7 files changed, 19 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index 074b758efc42..7c288358a745 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -39,6 +39,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7e469260fe5e..9a0bdad9ad8f 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -611,6 +611,7 @@ static void dec_pending(struct dm_io *io, int error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ + trace_block_bio_complete(md->queue, bio, io_error); bio_endio(bio, io_error); } } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 24909eb13fec..f4e87bfc7567 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -184,6 +184,8 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); bi = return_bi; } @@ -3914,6 +3916,8 @@ static void raid5_align_endio(struct bio *bi, int error) rdev_dec_pending(rdev, conf->mddev); if (!error && uptodate) { + trace_block_bio_complete(bdev_get_queue(raid_bi->bi_bdev), + raid_bi, 0); bio_endio(raid_bi, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); @@ -4382,6 +4386,8 @@ static void make_request(struct mddev *mddev, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); + trace_block_bio_complete(bdev_get_queue(bi->bi_bdev), + bi, 0); bio_endio(bi, 0); } } @@ -4758,8 +4764,11 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) handled++; } remaining = raid5_dec_bi_active_stripes(raid_bio); - if (remaining == 0) + if (remaining == 0) { + trace_block_bio_complete(bdev_get_queue(raid_bio->bi_bdev), + raid_bio, 0); bio_endio(raid_bio, 0); + } if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; diff --git a/fs/bio.c b/fs/bio.c index bb5768f59b32..b96fc6ce4855 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1428,8 +1428,6 @@ void bio_endio(struct bio *bio, int error) else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) error = -EIO; - trace_block_bio_complete(bio, error); - if (bio->bi_end_io) bio->bi_end_io(bio, error); } diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 0ea61e07a91c..7c2e030e72f1 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -12,7 +12,6 @@ struct blk_trace { int trace_state; - bool rq_based; struct rchan *rchan; unsigned long __percpu *sequence; unsigned char __percpu *msg_data; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 9961726523d0..9c1467357b03 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -257,6 +257,7 @@ TRACE_EVENT(block_bio_bounce, /** * block_bio_complete - completed all work on the block operation + * @q: queue holding the block operation * @bio: block operation completed * @error: io error value * @@ -265,9 +266,9 @@ TRACE_EVENT(block_bio_bounce, */ TRACE_EVENT(block_bio_complete, - TP_PROTO(struct bio *bio, int error), + TP_PROTO(struct request_queue *q, struct bio *bio, int error), - TP_ARGS(bio, error), + TP_ARGS(q, bio, error), TP_STRUCT__entry( __field( dev_t, dev ) @@ -278,8 +279,7 @@ TRACE_EVENT(block_bio_complete, ), TP_fast_assign( - __entry->dev = bio->bi_bdev ? - bio->bi_bdev->bd_dev : 0; + __entry->dev = bio->bi_bdev->bd_dev; __entry->sector = bio->bi_sector; __entry->nr_sector = bio->bi_size >> 9; __entry->error = error; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272eec..5a0f781cd729 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, struct request *rq) { - struct blk_trace *bt = q->blk_trace; - - /* if control ever passes through here, it's a request based driver */ - if (unlikely(bt && !bt->rq_based)) - bt->rq_based = true; - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore, blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } -static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio, + int error) { - struct request_queue *q; - struct blk_trace *bt; - - if (!bio->bi_bdev) - return; - - q = bdev_get_queue(bio->bi_bdev); - bt = q->blk_trace; - - /* - * Request based drivers will generate both rq and bio completions. - * Ignore bio ones. - */ - if (likely(!bt) || bt->rq_based) - return; - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } -- cgit v1.2.3 From 712317ad97f41e738e1a19aa0a6392a78a84094e Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Thu, 18 Apr 2013 23:09:52 -0700 Subject: cgroup: fix broken file xattrs We should store file xattrs in struct cfent instead of struct cftype, because cftype is a type while cfent is object instance of cftype. For example each cgroup has a tasks file, and each tasks file is associated with a uniq cfent, but all those files share the same struct cftype. Alexey Kodanev reported a crash, which can be reproduced: # mount -t cgroup -o xattr /sys/fs/cgroup # mkdir /sys/fs/cgroup/test # setfattr -n trusted.value -v test_value /sys/fs/cgroup/tasks # rmdir /sys/fs/cgroup/test # umount /sys/fs/cgroup oops! In this case, simple_xattrs_free() will free the same struct simple_xattrs twice. tj: Dropped unused local variable @cft from cgroup_diput(). Cc: <stable@vger.kernel.org> # 3.8.x Reported-by: Alexey Kodanev <alexey.kodanev@oracle.com> Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 11 ++++++----- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cda7eb2239e1..c371888298d5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -422,9 +422,6 @@ struct cftype { /* CFTYPE_* flags */ unsigned int flags; - /* file xattrs */ - struct simple_xattrs xattrs; - int (*open)(struct inode *inode, struct file *file); ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, struct file *file, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c16719e056ab..192d762ec1f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -117,6 +117,9 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + + /* file xattrs */ + struct simple_xattrs xattrs; }; /* @@ -882,13 +885,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) } else { struct cfent *cfe = __d_cfe(dentry); struct cgroup *cgrp = dentry->d_parent->d_fsdata; - struct cftype *cft = cfe->type; WARN_ONCE(!list_empty(&cfe->node) && cgrp != &cgrp->root->top_cgroup, "cfe still linked for %s\n", cfe->type->name); + simple_xattrs_free(&cfe->xattrs); kfree(cfe); - simple_xattrs_free(&cft->xattrs); } iput(inode); } @@ -2501,7 +2503,7 @@ static struct simple_xattrs *__d_xattrs(struct dentry *dentry) if (S_ISDIR(dentry->d_inode->i_mode)) return &__d_cgrp(dentry)->xattrs; else - return &__d_cft(dentry)->xattrs; + return &__d_cfe(dentry)->xattrs; } static inline int xattr_enabled(struct dentry *dentry) @@ -2677,8 +2679,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - simple_xattrs_init(&cft->xattrs); - if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, subsys->name); strcat(name, "."); @@ -2703,6 +2703,7 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, cfe->type = (void *)cft; cfe->dentry = dentry; dentry->d_fsdata = cfe; + simple_xattrs_init(&cfe->xattrs); list_add_tail(&cfe->node, &parent->files); cfe = NULL; } -- cgit v1.2.3 From 41fcb9f230bf773656d1768b73000ef720bf00c3 Mon Sep 17 00:00:00 2001 From: Waiman Long <Waiman.Long@hp.com> Date: Wed, 17 Apr 2013 15:23:11 -0400 Subject: mutex: Move mutex spinning code from sched/core.c back to mutex.c As mentioned by Ingo, the SCHED_FEAT_OWNER_SPIN scheduler feature bit was really just an early hack to make with/without mutex-spinning testable. So it is no longer necessary. This patch removes the SCHED_FEAT_OWNER_SPIN feature bit and move the mutex spinning code from kernel/sched/core.c back to kernel/mutex.c which is where they should belong. Signed-off-by: Waiman Long <Waiman.Long@hp.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Chandramouleeswaran Aswin <aswin@hp.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Norton Scott J <scott.norton@hp.com> Cc: Rik van Riel <riel@redhat.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: David Howells <dhowells@redhat.com> Cc: Dave Jones <davej@redhat.com> Cc: Clark Williams <williams@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366226594-5506-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- include/linux/sched.h | 1 - kernel/mutex.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 45 --------------------------------------------- kernel/sched/features.h | 7 ------- 4 files changed, 46 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..aefe45d79f53 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -320,7 +320,6 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e0..262d7177adad 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -95,6 +95,52 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + if (lock->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(lock, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. + */ + return lock->owner == NULL; +} +#endif + static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..b37a22b99e0e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2997,51 +2997,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -45,13 +45,6 @@ SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) -/* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - /* * Decrement CPU power based on time not spent running tasks */ -- cgit v1.2.3 From 0dc8c730c98a06a4d927f8d08bd0dd6de973b8dd Mon Sep 17 00:00:00 2001 From: Waiman Long <Waiman.Long@hp.com> Date: Wed, 17 Apr 2013 15:23:12 -0400 Subject: mutex: Make more scalable by doing less atomic operations In the __mutex_lock_common() function, an initial entry into the lock slow path will cause two atomic_xchg instructions to be issued. Together with the atomic decrement in the fast path, a total of three atomic read-modify-write instructions will be issued in rapid succession. This can cause a lot of cache bouncing when many tasks are trying to acquire the mutex at the same time. This patch will reduce the number of atomic_xchg instructions used by checking the counter value first before issuing the instruction. The atomic_read() function is just a simple memory read. The atomic_xchg() function, on the other hand, can be up to 2 order of magnitude or even more in cost when compared with atomic_read(). By using atomic_read() to check the value first before calling atomic_xchg(), we can avoid a lot of unnecessary cache coherency traffic. The only downside with this change is that a task on the slow path will have a tiny bit less chance of getting the mutex when competing with another task in the fast path. The same is true for the atomic_cmpxchg() function in the mutex-spin-on-owner loop. So an atomic_read() is also performed before calling atomic_cmpxchg(). The mutex locking and unlocking code for the x86 architecture can allow any negative number to be used in the mutex count to indicate that some tasks are waiting for the mutex. I am not so sure if that is the case for the other architectures. So the default is to avoid atomic_xchg() if the count has already been set to -1. For x86, the check is modified to include all negative numbers to cover a larger case. The following table shows the jobs per minutes (JPM) scalability data on an 8-node 80-core Westmere box with a 3.7.10 kernel. The numactl command is used to restrict the running of the high_systime workloads to 1/2/4/8 nodes with hyperthreading on and off. +-----------------+-----------+------------+----------+ | Configuration | Mean JPM | Mean JPM | % Change | | | w/o patch | with patch | | +-----------------+-----------------------------------+ | | User Range 1100 - 2000 | +-----------------+-----------------------------------+ | 8 nodes, HT on | 36980 | 148590 | +301.8% | | 8 nodes, HT off | 42799 | 145011 | +238.8% | | 4 nodes, HT on | 61318 | 118445 | +51.1% | | 4 nodes, HT off | 158481 | 158592 | +0.1% | | 2 nodes, HT on | 180602 | 173967 | -3.7% | | 2 nodes, HT off | 198409 | 198073 | -0.2% | | 1 node , HT on | 149042 | 147671 | -0.9% | | 1 node , HT off | 126036 | 126533 | +0.4% | +-----------------+-----------------------------------+ | | User Range 200 - 1000 | +-----------------+-----------------------------------+ | 8 nodes, HT on | 41525 | 122349 | +194.6% | | 8 nodes, HT off | 49866 | 124032 | +148.7% | | 4 nodes, HT on | 66409 | 106984 | +61.1% | | 4 nodes, HT off | 119880 | 130508 | +8.9% | | 2 nodes, HT on | 138003 | 133948 | -2.9% | | 2 nodes, HT off | 132792 | 131997 | -0.6% | | 1 node , HT on | 116593 | 115859 | -0.6% | | 1 node , HT off | 104499 | 104597 | +0.1% | +-----------------+------------+-----------+----------+ At low user range 10-100, the JPM differences were within +/-1%. So they are not that interesting. AIM7 benchmark run has a pretty large run-to-run variance due to random nature of the subtests executed. So a difference of less than +-5% may not be really significant. This patch improves high_systime workload performance at 4 nodes and up by maintaining transaction rates without significant drop-off at high node count. The patch has practically no impact on 1 and 2 nodes system. The table below shows the percentage time (as reported by perf record -a -s -g) spent on the __mutex_lock_slowpath() function by the high_systime workload at 1500 users for 2/4/8-node configurations with hyperthreading off. +---------------+-----------------+------------------+---------+ | Configuration | %Time w/o patch | %Time with patch | %Change | +---------------+-----------------+------------------+---------+ | 8 nodes | 65.34% | 0.69% | -99% | | 4 nodes | 8.70% | 1.02% | -88% | | 2 nodes | 0.41% | 0.32% | -22% | +---------------+-----------------+------------------+---------+ It is obvious that the dramatic performance improvement at 8 nodes was due to the drastic cut in the time spent within the __mutex_lock_slowpath() function. The table below show the improvements in other AIM7 workloads (at 8 nodes, hyperthreading off). +--------------+---------------+----------------+-----------------+ | Workload | mean % change | mean % change | mean % change | | | 10-100 users | 200-1000 users | 1100-2000 users | +--------------+---------------+----------------+-----------------+ | alltests | +0.6% | +104.2% | +185.9% | | five_sec | +1.9% | +0.9% | +0.9% | | fserver | +1.4% | -7.7% | +5.1% | | new_fserver | -0.5% | +3.2% | +3.1% | | shared | +13.1% | +146.1% | +181.5% | | short | +7.4% | +5.0% | +4.2% | +--------------+---------------+----------------+-----------------+ Signed-off-by: Waiman Long <Waiman.Long@hp.com> Reviewed-by: Davidlohr Bueso <davidlohr.bueso@hp.com> Reviewed-by: Rik van Riel <riel@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Chandramouleeswaran Aswin <aswin@hp.com> Cc: Norton: Scott J <scott.norton@hp.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: David Howells <dhowells@redhat.com> Cc: Dave Jones <davej@redhat.com> Cc: Clark Williams <williams@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366226594-5506-3-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- arch/x86/include/asm/mutex.h | 10 ++++++++++ kernel/mutex.c | 19 ++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h index 7d3a48275394..bc2a0b0dcea6 100644 --- a/arch/x86/include/asm/mutex.h +++ b/arch/x86/include/asm/mutex.h @@ -3,3 +3,13 @@ #else # include <asm/mutex_64.h> #endif + +#ifndef __ASM_MUTEX_H +#define __ASM_MUTEX_H +/* + * For the x86 architecture, it allows any negative number (besides -1) in + * the mutex count to indicate that some other threads are waiting on the + * mutex. + */ +#define __ARCH_ALLOW_ANY_NEGATIVE_MUTEX_COUNT 1 +#endif diff --git a/kernel/mutex.c b/kernel/mutex.c index 262d7177adad..70ebd855d9e8 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -37,6 +37,17 @@ # include <asm/mutex.h> #endif +/* + * A mutex count of -1 indicates that waiters are sleeping waiting for the + * mutex. Some architectures can allow any negative number, not just -1, for + * this purpose. + */ +#ifdef __ARCH_ALLOW_ANY_NEGATIVE_MUTEX_COUNT +#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) +#else +#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) != -1) +#endif + void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -217,7 +228,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if (owner && !mutex_spin_on_owner(lock, owner)) break; - if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + if ((atomic_read(&lock->count) == 1) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); preempt_enable(); @@ -251,7 +263,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) goto done; lock_contended(&lock->dep_map, ip); @@ -266,7 +278,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * that when we release the lock, we properly wake up the * other waiters: */ - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && + (atomic_xchg(&lock->count, -1) == 1)) break; /* -- cgit v1.2.3 From 2bd2c92cf07cc4a373bf316c75b78ac465fefd35 Mon Sep 17 00:00:00 2001 From: Waiman Long <Waiman.Long@hp.com> Date: Wed, 17 Apr 2013 15:23:13 -0400 Subject: mutex: Queue mutex spinners with MCS lock to reduce cacheline contention The current mutex spinning code (with MUTEX_SPIN_ON_OWNER option turned on) allow multiple tasks to spin on a single mutex concurrently. A potential problem with the current approach is that when the mutex becomes available, all the spinning tasks will try to acquire the mutex more or less simultaneously. As a result, there will be a lot of cacheline bouncing especially on systems with a large number of CPUs. This patch tries to reduce this kind of contention by putting the mutex spinners into a queue so that only the first one in the queue will try to acquire the mutex. This will reduce contention and allow all the tasks to move forward faster. The queuing of mutex spinners is done using an MCS lock based implementation which will further reduce contention on the mutex cacheline than a similar ticket spinlock based implementation. This patch will add a new field into the mutex data structure for holding the MCS lock. This expands the mutex size by 8 bytes for 64-bit system and 4 bytes for 32-bit system. This overhead will be avoid if the MUTEX_SPIN_ON_OWNER option is turned off. The following table shows the jobs per minute (JPM) scalability data on an 8-node 80-core Westmere box with a 3.7.10 kernel. The numactl command is used to restrict the running of the fserver workloads to 1/2/4/8 nodes with hyperthreading off. +-----------------+-----------+-----------+-------------+----------+ | Configuration | Mean JPM | Mean JPM | Mean JPM | % Change | | | w/o patch | patch 1 | patches 1&2 | 1->1&2 | +-----------------+------------------------------------------------+ | | User Range 1100 - 2000 | +-----------------+------------------------------------------------+ | 8 nodes, HT off | 227972 | 227237 | 305043 | +34.2% | | 4 nodes, HT off | 393503 | 381558 | 394650 | +3.4% | | 2 nodes, HT off | 334957 | 325240 | 338853 | +4.2% | | 1 node , HT off | 198141 | 197972 | 198075 | +0.1% | +-----------------+------------------------------------------------+ | | User Range 200 - 1000 | +-----------------+------------------------------------------------+ | 8 nodes, HT off | 282325 | 312870 | 332185 | +6.2% | | 4 nodes, HT off | 390698 | 378279 | 393419 | +4.0% | | 2 nodes, HT off | 336986 | 326543 | 340260 | +4.2% | | 1 node , HT off | 197588 | 197622 | 197582 | 0.0% | +-----------------+-----------+-----------+-------------+----------+ At low user range 10-100, the JPM differences were within +/-1%. So they are not that interesting. The fserver workload uses mutex spinning extensively. With just the mutex change in the first patch, there is no noticeable change in performance. Rather, there is a slight drop in performance. This mutex spinning patch more than recovers the lost performance and show a significant increase of +30% at high user load with the full 8 nodes. Similar improvements were also seen in a 3.8 kernel. The table below shows the %time spent by different kernel functions as reported by perf when running the fserver workload at 1500 users with all 8 nodes. +-----------------------+-----------+---------+-------------+ | Function | % time | % time | % time | | | w/o patch | patch 1 | patches 1&2 | +-----------------------+-----------+---------+-------------+ | __read_lock_failed | 34.96% | 34.91% | 29.14% | | __write_lock_failed | 10.14% | 10.68% | 7.51% | | mutex_spin_on_owner | 3.62% | 3.42% | 2.33% | | mspin_lock | N/A | N/A | 9.90% | | __mutex_lock_slowpath | 1.46% | 0.81% | 0.14% | | _raw_spin_lock | 2.25% | 2.50% | 1.10% | +-----------------------+-----------+---------+-------------+ The fserver workload for an 8-node system is dominated by the contention in the read/write lock. Mutex contention also plays a role. With the first patch only, mutex contention is down (as shown by the __mutex_lock_slowpath figure) which help a little bit. We saw only a few percents improvement with that. By applying patch 2 as well, the single mutex_spin_on_owner figure is now split out into an additional mspin_lock figure. The time increases from 3.42% to 11.23%. It shows a great reduction in contention among the spinners leading to a 30% improvement. The time ratio 9.9/2.33=4.3 indicates that there are on average 4+ spinners waiting in the spin_lock loop for each spinner in the mutex_spin_on_owner loop. Contention in other locking functions also go down by quite a lot. The table below shows the performance change of both patches 1 & 2 over patch 1 alone in other AIM7 workloads (at 8 nodes, hyperthreading off). +--------------+---------------+----------------+-----------------+ | Workload | mean % change | mean % change | mean % change | | | 10-100 users | 200-1000 users | 1100-2000 users | +--------------+---------------+----------------+-----------------+ | alltests | 0.0% | -0.8% | +0.6% | | five_sec | -0.3% | +0.8% | +0.8% | | high_systime | +0.4% | +2.4% | +2.1% | | new_fserver | +0.1% | +14.1% | +34.2% | | shared | -0.5% | -0.3% | -0.4% | | short | -1.7% | -9.8% | -8.3% | +--------------+---------------+----------------+-----------------+ The short workload is the only one that shows a decline in performance probably due to the spinner locking and queuing overhead. Signed-off-by: Waiman Long <Waiman.Long@hp.com> Reviewed-by: Davidlohr Bueso <davidlohr.bueso@hp.com> Acked-by: Rik van Riel <riel@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Chandramouleeswaran Aswin <aswin@hp.com> Cc: Norton Scott J <scott.norton@hp.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: David Howells <dhowells@redhat.com> Cc: Dave Jones <davej@redhat.com> Cc: Clark Williams <williams@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366226594-5506-4-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- include/linux/mutex.h | 3 ++ kernel/mutex.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 93 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 9121595a8ebf..433da8a1a426 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -53,6 +53,9 @@ struct mutex { #if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) struct task_struct *owner; #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + void *spin_mlock; /* Spinner MCS lock */ +#endif #ifdef CONFIG_DEBUG_MUTEXES const char *name; void *magic; diff --git a/kernel/mutex.c b/kernel/mutex.c index 70ebd855d9e8..1dbd4210baef 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -55,6 +55,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + lock->spin_mlock = NULL; +#endif debug_mutex_init(lock, name, key); } @@ -107,6 +110,60 @@ EXPORT_SYMBOL(mutex_lock); #endif #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire a MCS lock + * first before spinning on the owner field. + * + * We don't inline mspin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +struct mspin_node { + struct mspin_node *next ; + int locked; /* 1 if lock acquired */ +}; +#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) + +static noinline +void mspin_lock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* Lock acquired */ + node->locked = 1; + return; + } + ACCESS_ONCE(prev->next) = node; + smp_wmb(); + /* Wait until the lock holder passes the lock down */ + while (!ACCESS_ONCE(node->locked)) + arch_mutex_cpu_relax(); +} + +static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (cmpxchg(lock, node, NULL) == node) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + ACCESS_ONCE(next->locked) = 1; + smp_wmb(); +} + /* * Mutex spinning code migrated from kernel/sched/core.c */ @@ -150,6 +207,24 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ return lock->owner == NULL; } + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ + int retval = 1; + + rcu_read_lock(); + if (lock->owner) + retval = lock->owner->on_cpu; + rcu_read_unlock(); + /* + * if lock->owner is not set, the mutex owner may have just acquired + * it and not set the owner yet or the mutex has been released. + */ + return retval; +} #endif static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); @@ -215,26 +290,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * * We can't do this for DEBUG_MUTEXES because that relies on wait_lock * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. */ + if (!mutex_can_spin_on_owner(lock)) + goto slowpath; for (;;) { struct task_struct *owner; + struct mspin_node node; /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ + mspin_lock(MLOCK(lock), &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) + if (owner && !mutex_spin_on_owner(lock, owner)) { + mspin_unlock(MLOCK(lock), &node); break; + } if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); + mspin_unlock(MLOCK(lock), &node); preempt_enable(); return 0; } + mspin_unlock(MLOCK(lock), &node); /* * When there's no owner, we might have preempted between the @@ -253,6 +341,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } +slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From cc189d2513d1f45cde87a9043fe3be28559c7490 Mon Sep 17 00:00:00 2001 From: Waiman Long <Waiman.Long@hp.com> Date: Wed, 17 Apr 2013 15:23:14 -0400 Subject: mutex: Back out architecture specific check for negative mutex count Linus suggested that probably all the supported architectures can allow a negative mutex count without incorrect behavior, so we can then back out the architecture specific change and allow the mutex count to go to any negative number. That should further reduce contention for non-x86 architecture. Suggested-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Waiman Long <Waiman.Long@hp.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Chandramouleeswaran Aswin <aswin@hp.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Norton Scott J <scott.norton@hp.com> Cc: Rik van Riel <riel@redhat.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: David Howells <dhowells@redhat.com> Cc: Dave Jones <davej@redhat.com> Cc: Clark Williams <williams@redhat.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366226594-5506-5-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- arch/x86/include/asm/mutex.h | 10 ---------- kernel/mutex.c | 9 ++------- 2 files changed, 2 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h index bc2a0b0dcea6..7d3a48275394 100644 --- a/arch/x86/include/asm/mutex.h +++ b/arch/x86/include/asm/mutex.h @@ -3,13 +3,3 @@ #else # include <asm/mutex_64.h> #endif - -#ifndef __ASM_MUTEX_H -#define __ASM_MUTEX_H -/* - * For the x86 architecture, it allows any negative number (besides -1) in - * the mutex count to indicate that some other threads are waiting on the - * mutex. - */ -#define __ARCH_ALLOW_ANY_NEGATIVE_MUTEX_COUNT 1 -#endif diff --git a/kernel/mutex.c b/kernel/mutex.c index 1dbd4210baef..ad53a664f113 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -38,15 +38,10 @@ #endif /* - * A mutex count of -1 indicates that waiters are sleeping waiting for the - * mutex. Some architectures can allow any negative number, not just -1, for - * this purpose. + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. */ -#ifdef __ARCH_ALLOW_ANY_NEGATIVE_MUTEX_COUNT #define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) -#else -#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) != -1) -#endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) -- cgit v1.2.3 From 4c69e6ea415a35eb7f0fc8ee9390c8f7436492a2 Mon Sep 17 00:00:00 2001 From: Sahara <keun-o.park@windriver.com> Date: Mon, 15 Apr 2013 11:13:15 +0900 Subject: tracepoints: Prevent null probe from being added Somehow tracepoint_entry_add_probe() function allows a null probe function. And, this may lead to unexpected results since the number of probe functions in an entry can be counted by checking whether a probe is null or not in the for-loop. This patch prevents a null probe from being added. In tracepoint_entry_remove_probe() function, checking probe parameter within the for-loop is moved out for code efficiency, leaving the null probe feature which removes all probe functions in the entry. Link: http://lkml.kernel.org/r/1365991995-19445-1-git-send-email-kpark3469@gmail.com Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Acked-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Signed-off-by: Sahara <keun-o.park@windriver.com> Signed-off-by: Steven Rostedt <rostedt@goodmis.org> --- kernel/tracepoint.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index d96ba22dabfa..99e7e314e451 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, int nr_probes = 0; struct tracepoint_func *old, *new; - WARN_ON(!probe); + if (WARN_ON(!probe)) + return ERR_PTR(-EINVAL); debug_print_probes(entry); old = entry->funcs; @@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (!probe || - (old[nr_probes].func == probe && - old[nr_probes].data == data)) - nr_del++; + if (probe) { + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if (old[nr_probes].func == probe && + old[nr_probes].data == data) + nr_del++; + } } + /* + * If probe is NULL, then nr_probes = nr_del = 0, and then the + * entire entry will be removed. + */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ entry->funcs = NULL; @@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i].func; i++) - if (probe && - (old[i].func != probe || old[i].data != data)) + if (old[i].func != probe || old[i].data != data) new[j++] = old[i]; new[nr_probes - nr_del].func = NULL; entry->refcount = nr_probes - nr_del; -- cgit v1.2.3 From c79aa0d96548aee50570209eb2d45c8f4ac49230 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Date: Fri, 19 Apr 2013 12:01:24 -0700 Subject: events: Protect access via task_subsys_state_check() The following RCU splat indicates lack of RCU protection: [ 953.267649] =============================== [ 953.267652] [ INFO: suspicious RCU usage. ] [ 953.267657] 3.9.0-0.rc6.git2.4.fc19.ppc64p7 #1 Not tainted [ 953.267661] ------------------------------- [ 953.267664] include/linux/cgroup.h:534 suspicious rcu_dereference_check() usage! [ 953.267669] [ 953.267669] other info that might help us debug this: [ 953.267669] [ 953.267675] [ 953.267675] rcu_scheduler_active = 1, debug_locks = 0 [ 953.267680] 1 lock held by glxgears/1289: [ 953.267683] #0: (&sig->cred_guard_mutex){+.+.+.}, at: [<c00000000027f884>] .prepare_bprm_creds+0x34/0xa0 [ 953.267700] [ 953.267700] stack backtrace: [ 953.267704] Call Trace: [ 953.267709] [c0000001f0d1b6e0] [c000000000016e30] .show_stack+0x130/0x200 (unreliable) [ 953.267717] [c0000001f0d1b7b0] [c0000000001267f8] .lockdep_rcu_suspicious+0x138/0x180 [ 953.267724] [c0000001f0d1b840] [c0000000001d43a4] .perf_event_comm+0x4c4/0x690 [ 953.267731] [c0000001f0d1b950] [c00000000027f6e4] .set_task_comm+0x84/0x1f0 [ 953.267737] [c0000001f0d1b9f0] [c000000000280414] .setup_new_exec+0x94/0x220 [ 953.267744] [c0000001f0d1ba70] [c0000000002f665c] .load_elf_binary+0x58c/0x19b0 ... This commit therefore adds the required RCU read-side critical section to perf_event_comm(). Reported-by: Adam Jackson <ajax@redhat.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: acme@ghostprotocols.net Link: http://lkml.kernel.org/r/20130419190124.GA8638@linux.vnet.ibm.com Signed-off-by: Ingo Molnar <mingo@kernel.org> Tested-by: Gustavo Luiz Duarte <gusld@br.ibm.com> --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4d3124b39277..9fcb0944f071 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4596,6 +4596,7 @@ void perf_event_comm(struct task_struct *task) struct perf_event_context *ctx; int ctxn; + rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (!ctx) @@ -4603,6 +4604,7 @@ void perf_event_comm(struct task_struct *task) perf_event_enable_on_exec(ctx); } + rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; -- cgit v1.2.3 From 642dbc39ab1ea00f47e0fee1b8e8a27da036d940 Mon Sep 17 00:00:00 2001 From: Vincent Guittot <vincent.guittot@linaro.org> Date: Thu, 18 Apr 2013 18:34:26 +0200 Subject: sched: Fix wrong rq's runnable_avg update with rt tasks The current update of the rq's load can be erroneous when RT tasks are involved. The update of the load of a rq that becomes idle, is done only if the avg_idle is less than sysctl_sched_migration_cost. If RT tasks and short idle duration alternate, the runnable_avg will not be updated correctly and the time will be accounted as idle time when a CFS task wakes up. A new idle_enter function is called when the next task is the idle function so the elapsed time will be accounted as run time in the load of the rq, whatever the average idle time is. The function update_rq_runnable_avg is removed from idle_balance. When a RT task is scheduled on an idle CPU, the update of the rq's load is not done when the rq exit idle state because CFS's functions are not called. Then, the idle_balance, which is called just before entering the idle function, updates the rq's load and makes the assumption that the elapsed time since the last update, was only running time. As a consequence, the rq's load of a CPU that only runs a periodic RT task, is close to LOAD_AVG_MAX whatever the running duration of the RT task is. A new idle_exit function is called when the prev task is the idle function so the elapsed time will be accounted as idle time in the rq's load. Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Steven Rostedt <rostedt@goodmis.org> Cc: linaro-kernel@lists.linaro.org Cc: peterz@infradead.org Cc: pjt@google.com Cc: fweisbec@gmail.com Cc: efault@gmx.de Link: http://lkml.kernel.org/r/1366302867-5055-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 23 +++++++++++++++++++++-- kernel/sched/idle_task.c | 16 ++++++++++++++++ kernel/sched/sched.h | 12 ++++++++++++ 3 files changed, 49 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 155783b4e4bf..1c977350e322 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. + */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + #else static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} @@ -5217,8 +5238,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; - update_rq_runnable_avg(this_rq, 1); - /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae9..b8ce77328341 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ + idle_exit_fair(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ + idle_enter_fair(rq); +} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP + /* Trigger the post schedule to do an idle_enter for CFS */ + rq->post_schedule = 1; +#endif return rq->idle; } @@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .pre_schedule = pre_schedule_idle, + .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8116cf8e350f..605426a63588 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1024,6 +1024,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); +/* + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg + * becomes useful in lb + */ +#if defined(CONFIG_FAIR_GROUP_SCHED) +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); +#else +static inline void idle_enter_fair(struct rq *this_rq) {} +static inline void idle_exit_fair(struct rq *this_rq) {} +#endif + #else /* CONFIG_SMP */ static inline void idle_balance(int cpu, struct rq *rq) -- cgit v1.2.3 From f83b2933667b923d4e30ee04720e160438e26047 Mon Sep 17 00:00:00 2001 From: Rusty Russell <rusty@rustcorp.com.au> Date: Mon, 22 Apr 2013 18:51:50 +0930 Subject: kernel/hz.bc: ignore. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/.gitignore | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore index ab4f1090f437..b3097bde4e9c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -4,3 +4,4 @@ config_data.h config_data.gz timeconst.h +hz.bc -- cgit v1.2.3 From f1cd0858100c67273f2c74344e0c464344c4a982 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim <iamjoonsoo.kim@lge.com> Date: Tue, 23 Apr 2013 17:27:37 +0900 Subject: sched: Change position of resched_cpu() in load_balance() cur_ld_moved is reset if env.flags hit LBF_NEED_BREAK. So, there is possibility that we miss doing resched_cpu(). Correct it as changing position of resched_cpu() before checking LBF_NEED_BREAK. Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Tested-by: Jason Low <jason.low2@hp.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366705662-3587-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c977350e322..25aaf93281de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5080,17 +5080,17 @@ more_balance: double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. */ if (cur_ld_moved && env.dst_cpu != smp_processor_id()) resched_cpu(env.dst_cpu); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group -- cgit v1.2.3 From de5eb2dd7f171ee8a45d23cd41aa2efe9ab922b3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim <iamjoonsoo.kim@lge.com> Date: Tue, 23 Apr 2013 17:27:38 +0900 Subject: sched: Explicitly cpu_idle_type checking in rebalance_domains() After commit 88b8dac0, dst-cpu can be changed in load_balance(), then we can't know cpu_idle_type of dst-cpu when load_balance() return positive. So, add explicit cpu_idle_type checking. Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Tested-by: Jason Low <jason.low2@hp.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366705662-3587-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25aaf93281de..726e12905725 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5523,10 +5523,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &balance)) { /* - * We've pulled tasks over so either we're no - * longer idle. + * The LBF_SOME_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. */ - idle = CPU_NOT_IDLE; + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; } -- cgit v1.2.3 From cfc03118047172f5bdc58d63c607d16d33ce5305 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim <iamjoonsoo.kim@lge.com> Date: Tue, 23 Apr 2013 17:27:39 +0900 Subject: sched: Don't consider other cpus in our group in case of NEWLY_IDLE Commit 88b8dac0 makes load_balance() consider other cpus in its group, regardless of idle type. When we do NEWLY_IDLE balancing, we should not consider it, because a motivation of NEWLY_IDLE balancing is to turn this cpu to non idle state if needed. This is not the case of other cpus. So, change code not to consider other cpus for NEWLY_IDLE balancing. With this patch, assign 'if (pulled_task) this_rq->idle_stamp = 0' in idle_balance() is corrected, because NEWLY_IDLE balancing doesn't consider other cpus. Assigning to 'this_rq->idle_stamp' is now valid. Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Tested-by: Jason Low <jason.low2@hp.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366705662-3587-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 726e12905725..dfa92b7b3dec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5026,8 +5026,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, .cpus = cpus, }; + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) { + env.dst_grpmask = NULL; + /* + * we don't care max_lb_iterations in this case, + * in following patch, this will be removed + */ + max_lb_iterations = 0; + } else + max_lb_iterations = cpumask_weight(env.dst_grpmask); + cpumask_copy(cpus, cpu_active_mask); - max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); -- cgit v1.2.3 From d31980846f9688db3ee3e5863525c6ff8ace4c7c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim <iamjoonsoo.kim@lge.com> Date: Tue, 23 Apr 2013 17:27:40 +0900 Subject: sched: Move up affinity check to mitigate useless redoing overhead Currently, LBF_ALL_PINNED is cleared after affinity check is passed. So, if task migration is skipped by small load value or small imbalance value in move_tasks(), we don't clear LBF_ALL_PINNED. At last, we trigger 'redo' in load_balance(). Imbalance value is often so small that any tasks cannot be moved to other cpus and, of course, this situation may be continued after we change the target cpu. So this patch move up affinity check code and clear LBF_ALL_PINNED before evaluating load value in order to mitigate useless redoing overhead. In addition, re-order some comments correctly. Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Tested-by: Jason Low <jason.low2@hp.com> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366705662-3587-5-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfa92b7b3dec..b8ef321641df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3896,10 +3896,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or + * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int new_dst_cpu; @@ -3967,9 +3971,6 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; - if (!can_migrate_task(p, env)) continue; @@ -4021,7 +4022,7 @@ static int move_tasks(struct lb_env *env) break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (!can_migrate_task(p, env)) goto next; load = task_h_load(p); @@ -4032,9 +4033,6 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - if (!can_migrate_task(p, env)) - goto next; - move_task(p, env); pulled++; env->imbalance -= load; -- cgit v1.2.3 From e6252c3ef4b9cd251b53f7b68035f395d20b044e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim <iamjoonsoo.kim@lge.com> Date: Tue, 23 Apr 2013 17:27:41 +0900 Subject: sched: Rename load_balance_tmpmask to load_balance_mask This name doesn't represent specific meaning. So rename it to imply it's purpose. Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Tested-by: Jason Low <jason.low2@hp.com> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366705662-3587-6-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee8c1bd703fe..cb49b2ab0e16 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6873,7 +6873,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6910,7 +6910,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b8ef321641df..5b1e96687b49 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4977,7 +4977,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static int need_active_balance(struct lb_env *env) { @@ -5012,7 +5012,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); struct lb_env env = { .sd = sd, -- cgit v1.2.3 From e02e60c109ca70935bad1131976bdbf5160cf576 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim <iamjoonsoo.kim@lge.com> Date: Tue, 23 Apr 2013 17:27:42 +0900 Subject: sched: Prevent to re-select dst-cpu in load_balance() Commit 88b8dac0 makes load_balance() consider other cpus in its group. But, in that, there is no code for preventing to re-select dst-cpu. So, same dst-cpu can be selected over and over. This patch add functionality to load_balance() in order to exclude cpu which is selected once. We prevent to re-select dst_cpu via env's cpus, so now, env's cpus is a candidate not only for src_cpus, but also dst_cpus. With this patch, we can remove lb_iterations and max_lb_iterations, because we decide whether we can go ahead or not via env's cpus. Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Tested-by: Jason Low <jason.low2@hp.com> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Davidlohr Bueso <davidlohr.bueso@hp.com> Cc: Peter Zijlstra <peterz@infradead.org> Link: http://lkml.kernel.org/r/1366705662-3587-7-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/sched/fair.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b1e96687b49..acaf567a03d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3905,7 +3905,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { - int new_dst_cpu; + int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -3920,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; - new_dst_cpu = cpumask_first_and(env->dst_grpmask, - tsk_cpus_allowed(p)); - if (new_dst_cpu < nr_cpu_ids) { - env->flags |= LBF_SOME_PINNED; - env->new_dst_cpu = new_dst_cpu; + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = cpu; + break; + } } + return 0; } @@ -5008,7 +5011,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *balance) { int ld_moved, cur_ld_moved, active_balance = 0; - int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -5028,15 +5030,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * For NEWLY_IDLE load_balancing, we don't need to consider * other cpus in our group */ - if (idle == CPU_NEWLY_IDLE) { + if (idle == CPU_NEWLY_IDLE) env.dst_grpmask = NULL; - /* - * we don't care max_lb_iterations in this case, - * in following patch, this will be removed - */ - max_lb_iterations = 0; - } else - max_lb_iterations = cpumask_weight(env.dst_grpmask); cpumask_copy(cpus, cpu_active_mask); @@ -5064,7 +5059,6 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; - lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -5121,14 +5115,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && - lb_iterations++ < max_lb_iterations) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. -- cgit v1.2.3 From 199e371f59d31c828345b0d959d27d752827b517 Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Tue, 23 Apr 2013 12:34:03 -0400 Subject: lockdep: Print out additional debugging advice when we hit lockdep BUGs We occasionally get reports of these BUGs being hit, and the stack trace doesn't necessarily always tell us what we need to know about why we are hitting those limits. If users start attaching /proc/lock_stats to reports we may have more of a clue what's going on. Signed-off-by: Dave Jones <davej@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20130423163403.GA12839@redhat.com Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/lockdep.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e5deddadeab1..c5d1e6bbdb3e 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -411,6 +411,7 @@ static int save_trace(struct stack_trace *trace) printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); printk("turning off the locking correctness validator.\n"); + printk("Attach output of /proc/lock_stat to bug report\n"); dump_stack(); return 0; @@ -765,6 +766,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); printk("turning off the locking correctness validator.\n"); + printk("Attach output of /proc/lock_stat to bug report\n"); dump_stack(); return NULL; } @@ -836,6 +838,7 @@ static struct lock_list *alloc_list_entry(void) printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); printk("turning off the locking correctness validator.\n"); + printk("Attach output of /proc/lock_stat to bug report\n"); dump_stack(); return NULL; } @@ -2050,6 +2053,7 @@ cache_hit: printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); printk("turning off the locking correctness validator.\n"); + printk("Attach output of /proc/lock_stat to bug report\n"); dump_stack(); return 0; } @@ -3191,6 +3195,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); printk("turning off the locking correctness validator.\n"); + printk("Attach output of /proc/lock_stat to bug report\n"); lockdep_print_held_locks(current); debug_show_all_locks(); -- cgit v1.2.3 From 2c522836627c6e78660f8bd52cdb4cdcb75e3e3c Mon Sep 17 00:00:00 2001 From: Dave Jones <davej@redhat.com> Date: Thu, 25 Apr 2013 13:40:02 -0400 Subject: lockdep: Consolidate bug messages into a single print_lockdep_off() function Also add some missing printk levels. Signed-off-by: Dave Jones <davej@redhat.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Link: http://lkml.kernel.org/r/20130425174002.GA26769@redhat.com [ Tweaked the messages a bit. ] Signed-off-by: Ingo Molnar <mingo@kernel.org> --- kernel/lockdep.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c5d1e6bbdb3e..6a3bccba7e7d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -380,6 +380,13 @@ static int verbose(struct lock_class *class) unsigned long nr_stack_trace_entries; static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static void print_lockdep_off(const char *bug_msg) +{ + printk(KERN_DEBUG "%s\n", bug_msg); + printk(KERN_DEBUG "turning off the locking correctness validator.\n"); + printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +} + static int save_trace(struct stack_trace *trace) { trace->nr_entries = 0; @@ -409,9 +416,7 @@ static int save_trace(struct stack_trace *trace) if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - printk("Attach output of /proc/lock_stat to bug report\n"); + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); return 0; @@ -764,9 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) } raw_local_irq_restore(flags); - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); - printk("Attach output of /proc/lock_stat to bug report\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); return NULL; } @@ -836,9 +839,7 @@ static struct lock_list *alloc_list_entry(void) if (!debug_locks_off_graph_unlock()) return NULL; - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); - printk("Attach output of /proc/lock_stat to bug report\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); return NULL; } @@ -2051,9 +2052,7 @@ cache_hit: if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); - printk("turning off the locking correctness validator.\n"); - printk("Attach output of /proc/lock_stat to bug report\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); return 0; } @@ -3192,10 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", + print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); + printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); - printk("turning off the locking correctness validator.\n"); - printk("Attach output of /proc/lock_stat to bug report\n"); lockdep_print_held_locks(current); debug_show_all_locks(); -- cgit v1.2.3 From 25f55d9d01ad7a7ad248fd5af1d22675ffd202c5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot <vincent.guittot@linaro.org> Date: Tue, 23 Apr 2013 16:59:02 +0200 Subject: sched: Fix init NOHZ_IDLE flag On my SMP platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle - which makes the scheduler unhappy. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu() between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. A new nohz_idle field is added to sched_domain so both status and sched_domain will share the same RCU lifecycle and will be always synchronized. In addition, there is no more need to protect nohz_idle against concurrent access as it is only modified by 2 exclusive functions called by local cpu. This solution has been prefered to the creation of a new struct with an extra pointer indirection for sched_domain. The synchronization is done at the cost of : - An additional indirection and a rcu_dereference for accessing nohz_idle. - We use only the nohz_idle field of the top sched_domain. Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: linaro-kernel@lists.linaro.org Cc: peterz@infradead.org Cc: fweisbec@gmail.com Cc: pjt@google.com Cc: rostedt@goodmis.org Cc: efault@gmx.de Link: http://lkml.kernel.org/r/1366729142-14662-1-git-send-email-vincent.guittot@linaro.org [ Fixed !NO_HZ build bug. ] Signed-off-by: Ingo Molnar <mingo@kernel.org> --- include/linux/sched.h | 2 ++ kernel/sched/fair.c | 26 ++++++++++++++++---------- kernel/sched/sched.h | 1 - 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6bdaa73ede13..a25168f4ab86 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -808,6 +808,8 @@ struct sched_domain { unsigned int wake_idx; unsigned int forkexec_idx; unsigned int smt_gain; + + int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ int level; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index acaf567a03d2..8bf7081b1ec5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5420,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + for (; sd; sd = sd->parent) atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5435,13 +5438,16 @@ void set_cpu_sd_state_idle(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + for (; sd; sd = sd->parent) atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 605426a63588..4c225c4c7111 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1303,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled); enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, - NOHZ_IDLE, }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -- cgit v1.2.3 From cc20e01cd607282d48f8ea538aba10fa850a4312 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 26 Apr 2013 11:58:02 -0700 Subject: cgroup: fix use-after-free when umounting cgroupfs Try: # mount -t cgroup xxx /cgroup # mkdir /cgroup/sub && rmdir /cgroup/sub && umount /cgroup And you might see this: ida_remove called for id=1 which is not allocated. It's because cgroup_kill_sb() is called to destroy root->cgroup_ida and free cgrp->root before ida_simple_removed() is called. What's worse is we're accessing cgrp->root while it has been freed. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 192d762ec1f4..bd4de465d5a9 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -848,9 +848,12 @@ static void cgroup_free_fn(struct work_struct *work) */ dput(cgrp->parent->dentry); + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + /* * Drop the active superblock reference that we took when we - * created the cgroup + * created the cgroup. This will free cgrp->root, if we are + * holding the last reference to @sb. */ deactivate_super(cgrp->root->sb); @@ -862,7 +865,6 @@ static void cgroup_free_fn(struct work_struct *work) simple_xattrs_free(&cgrp->xattrs); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } -- cgit v1.2.3 From 7ef70e48735e17d2be5c8e8f85052842b16b923a Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Fri, 26 Apr 2013 11:58:03 -0700 Subject: cgroup: restore the call to eventfd->poll() I mistakenly removed the call to eventfd->poll() while I was actually intending to remove the return value... Calling evenfd->poll() will hook cgroup_event_wake() to the poll waitqueue, which will be called to unregister eventfd when rmdir a cgroup or close eventfd. Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cgroup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bd4de465d5a9..3f14a1310d23 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3882,6 +3882,8 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, if (ret) goto fail; + efile->f_op->poll(efile, &event->pt); + /* * Events should be removed after rmdir of cgroup directory, but before * destroying subsystem state objects. Let's take reference to cgroup -- cgit v1.2.3 From e0e80a02e5701c8790bd348ab59edb154fbda60b Mon Sep 17 00:00:00 2001 From: Li Zhong <zhong@linux.vnet.ibm.com> Date: Sat, 27 Apr 2013 06:52:43 -0700 Subject: cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn() In cpuset_hotplug_workfn(), partition_sched_domains() is called without hotplug lock held, which is actually needed (stated in the function header of partition_sched_domains()). This patch tries to use rebuild_sched_domains() to solve the above issue, and makes the code looks a little simpler. Signed-off-by: Li Zhong <zhong@linux.vnet.ibm.com> Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cpuset.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 866d78ee7930..8f0f45e002f2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2172,17 +2172,8 @@ static void cpuset_hotplug_workfn(struct work_struct *work) flush_workqueue(cpuset_propagate_hotplug_wq); /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) { - struct sched_domain_attr *attr; - cpumask_var_t *doms; - int ndoms; - - mutex_lock(&cpuset_mutex); - ndoms = generate_sched_domains(&doms, &attr); - mutex_unlock(&cpuset_mutex); - - partition_sched_domains(ndoms, doms, attr); - } + if (cpus_updated) + rebuild_sched_domains(); } void cpuset_update_active_cpus(bool cpu_online) -- cgit v1.2.3 From 5b16c2a493fe1e439c4e7ad51f58153968ca6cf3 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Sat, 27 Apr 2013 06:52:43 -0700 Subject: cpuset: fix cpu hotplug vs rebuild_sched_domains() race rebuild_sched_domains() might pass doms with offlined cpu to partition_sched_domains(), which results in an oops: general protection fault: 0000 [#1] SMP ... RIP: 0010:[<ffffffff81077a1e>] [<ffffffff81077a1e>] get_group+0x6e/0x90 ... Call Trace: [<ffffffff8107f07c>] build_sched_domains+0x70c/0xcb0 [<ffffffff8107f2a7>] ? build_sched_domains+0x937/0xcb0 [<ffffffff81173f64>] ? kfree+0xe4/0x1b0 [<ffffffff8107f6e0>] ? partition_sched_domains+0xc0/0x470 [<ffffffff8107f905>] partition_sched_domains+0x2e5/0x470 [<ffffffff8107f6e0>] ? partition_sched_domains+0xc0/0x470 [<ffffffff810c9007>] ? generate_sched_domains+0xc7/0x530 [<ffffffff810c94a8>] rebuild_sched_domains_locked+0x38/0x70 [<ffffffff810cb4a4>] cpuset_write_resmask+0x1a4/0x500 [<ffffffff810c8700>] ? cpuset_mount+0xe0/0xe0 [<ffffffff810c7f50>] ? cpuset_read_u64+0x100/0x100 [<ffffffff810be890>] ? cgroup_iter_next+0x90/0x90 [<ffffffff810cb300>] ? cpuset_css_offline+0x70/0x70 [<ffffffff810c1a73>] cgroup_file_write+0x133/0x2e0 [<ffffffff8118995b>] vfs_write+0xcb/0x130 [<ffffffff8118a174>] sys_write+0x64/0xa0 Reported-by: Li Zhong <zhong@linux.vnet.ibm.com> Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cpuset.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8f0f45e002f2..93d140f159cc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -769,12 +769,20 @@ static void rebuild_sched_domains_locked(void) lockdep_assert_held(&cpuset_mutex); get_online_cpus(); + /* + * We have raced with CPU hotplug. Don't do anything to avoid + * passing doms with offlined cpu to partition_sched_domains(). + * Anyways, hotplug work item will rebuild sched domains. + */ + if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) + goto out; + /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); /* Have scheduler rebuild the domains */ partition_sched_domains(ndoms, doms, attr); - +out: put_online_cpus(); } #else /* !CONFIG_SMP */ -- cgit v1.2.3 From 2a0010af17b1739ef8ea8cf02647a127241ee674 Mon Sep 17 00:00:00 2001 From: Li Zefan <lizefan@huawei.com> Date: Sun, 28 Apr 2013 09:46:57 +0800 Subject: cpuset: fix compile warning when CONFIG_SMP=n Reported by Fengguang's kbuild test robot: kernel/cpuset.c:787: warning: 'generate_sched_domains' defined but not used Introduced by commit e0e80a02e5701c8790bd348ab59edb154fbda60b ("cpuset: use rebuild_sched_domains() in cpuset_hotplug_workfn()), which removed generate_sched_domains() from cpuset_hotplug_workfn(). Reported-by: Fengguang Wu <fengguang.wu@intel.com> Signed-off-by: Li Zefan <lizefan@huawei.com> Signed-off-by: Tejun Heo <tj@kernel.org> --- kernel/cpuset.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 93d140f159cc..9bd30b9c430c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -789,13 +789,6 @@ out: static void rebuild_sched_domains_locked(void) { } - -static int generate_sched_domains(cpumask_var_t **domains, - struct sched_domain_attr **attributes) -{ - *domains = NULL; - return 1; -} #endif /* CONFIG_SMP */ void rebuild_sched_domains(void) -- cgit v1.2.3 From 4ecdafc8084fe1d95bb59ed7753b345abcd586fb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 29 Apr 2013 15:05:01 -0700 Subject: kthread: introduce to_live_kthread() "k->vfork_done != NULL" with a barrier() after to_kthread(k) in task_get_live_kthread(k) looks unclear, and sub-optimal because we load ->vfork_done twice. All we need is to ensure that we do not return to_kthread(NULL). Add a new trivial helper which loads/checks ->vfork_done once, this also looks more understandable. Signed-off-by: Oleg Nesterov <oleg@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Namhyung Kim <namhyung@kernel.org> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/kthread.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 9eb7fed0bbaa..a84dee265f94 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,8 +52,21 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define to_kthread(tsk) \ - container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ + return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ + struct completion *vfork = ACCESS_ONCE(k->vfork_done); + if (likely(vfork)) + return __to_kthread(vfork); + return NULL; +} /** * kthread_should_stop - should this kthread return now? @@ -313,15 +326,8 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), static struct kthread *task_get_live_kthread(struct task_struct *k) { - struct kthread *kthread; - get_task_struct(k); - kthread = to_kthread(k); - /* It might have exited */ - barrier(); - if (k->vfork_done != NULL) - return kthread; - return NULL; + return to_live_kthread(k); } static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) -- cgit v1.2.3 From b5c5442bb6bce0c67701d55124be561043a51faf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov <oleg@redhat.com> Date: Mon, 29 Apr 2013 15:05:12 -0700 Subject: kthread: kill task_get_live_kthread() task_get_live_kthread() looks confusing and unneeded. It does get_task_struct() but only kthread_stop() needs this, it can be called even if the calller doesn't have a reference when we know that this kthread can't exit until we do kthread_stop(). kthread_park() and kthread_unpark() do not need get_task_struct(), the callers already have the reference. And it can not help if we can race with the exiting kthread anyway, kthread_park() can hang forever in this case. Change kthread_park() and kthread_unpark() to use to_live_kthread(), change kthread_stop() to do get_task_struct() by hand and remove task_get_live_kthread(). Signed-off-by: Oleg Nesterov <oleg@redhat.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Namhyung Kim <namhyung@kernel.org> Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Rusty Russell <rusty@rustcorp.com.au> Cc: "Srivatsa S. Bhat" <srivatsa.bhat@linux.vnet.ibm.com> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/kthread.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index a84dee265f94..9b12d65186f7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -324,12 +324,6 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static struct kthread *task_get_live_kthread(struct task_struct *k) -{ - get_task_struct(k); - return to_live_kthread(k); -} - static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) { clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); @@ -356,11 +350,10 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) */ void kthread_unpark(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); if (kthread) __kthread_unpark(k, kthread); - put_task_struct(k); } /** @@ -377,7 +370,7 @@ void kthread_unpark(struct task_struct *k) */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); int ret = -ENOSYS; if (kthread) { @@ -390,7 +383,6 @@ int kthread_park(struct task_struct *k) } ret = 0; } - put_task_struct(k); return ret; } @@ -411,10 +403,13 @@ int kthread_park(struct task_struct *k) */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread; int ret; trace_sched_kthread_stop(k); + + get_task_struct(k); + kthread = to_live_kthread(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); __kthread_unpark(k, kthread); @@ -422,10 +417,9 @@ int kthread_stop(struct task_struct *k) wait_for_completion(&kthread->exited); } ret = k->exit_code; - put_task_struct(k); - trace_sched_kthread_stop_ret(ret); + trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); -- cgit v1.2.3 From 3f68613f39cdc242fa2e872ac04a802e7cc7b7cb Mon Sep 17 00:00:00 2001 From: Rakib Mullick <rakib.mullick@gmail.com> Date: Mon, 29 Apr 2013 15:05:13 -0700 Subject: kernel/auditsc.c: use kzalloc instead of kmalloc+memset In audit_alloc_context() use kzalloc instead of kmalloc+memset. Also rename audit_zero_context() to audit_set_context(), to represent it's inner workings properly. [akpm@linux-foundation.org: remove audit_set_context() altogether - fold it into its caller] Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Eric Paris <eparis@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/auditsc.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a9..c68229411a7c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context) } } -static inline void audit_zero_context(struct audit_context *context, - enum audit_state state) -{ - memset(context, 0, sizeof(*context)); - context->state = state; - context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; -} - static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) return NULL; - audit_zero_context(context, state); + context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); return context; -- cgit v1.2.3 From 13f51e1c3fbebeab801f768f433067ff075dea5a Mon Sep 17 00:00:00 2001 From: Gao feng <gaofeng@cn.fujitsu.com> Date: Mon, 29 Apr 2013 15:05:14 -0700 Subject: audit: don't check if kauditd is valid every time We only need to check if kauditd is valid after we start it, if kauditd is invalid, we will set kauditd_task to NULL. So next time, we will start kauditd again. It means if kauditd_task is not NULL,it must be valid. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> Cc: Eric Paris <eparis@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/audit.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f15..9816a1b96cfc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* As soon as there's any sign of userspace auditd, * start kauditd to talk to it */ - if (!kauditd_task) + if (!kauditd_task) { kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } } - loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); -- cgit v1.2.3 From 374c586d95f288296e1e718533401d8ce0eecee3 Mon Sep 17 00:00:00 2001 From: Gao feng <gaofeng@cn.fujitsu.com> Date: Mon, 29 Apr 2013 15:05:15 -0700 Subject: audit: remove duplicate export of audit_enabled audit_enabled has already been exported in include/linux/audit.h. and kernel/audit.h includes include/linux/audit.h, no need to export aduit_enabled again in kernel/audit.h Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Eric Paris <eparis@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/audit.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1b..d06ffc144f81 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -60,7 +60,6 @@ struct audit_entry { }; #ifdef CONFIG_AUDIT -extern int audit_enabled; extern int audit_ever_enabled; #endif -- cgit v1.2.3 From dde5b7d6e7be308ce371baa96058c2d40df26c05 Mon Sep 17 00:00:00 2001 From: Gao feng <gaofeng@cn.fujitsu.com> Date: Mon, 29 Apr 2013 15:05:16 -0700 Subject: audit: remove unnecessary #if CONFIG_AUDIT The files which include kernel/audit.h are complied only when CONFIG_AUDIT is set. Just like audit_pid, there is no need to surround audit_ever_enabled with CONFIG_AUDIT. Signed-off-by: Gao feng <gaofeng@cn.fujitsu.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Eric Paris <eparis@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/audit.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.h b/kernel/audit.h index d06ffc144f81..11468d99dad0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -59,9 +59,7 @@ struct audit_entry { struct audit_krule rule; }; -#ifdef CONFIG_AUDIT extern int audit_ever_enabled; -#endif extern int audit_pid; -- cgit v1.2.3 From 373e0f3408fe671550d69d9a7965d8a49e988525 Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Mon, 29 Apr 2013 15:05:18 -0700 Subject: kernel/auditfilter.c: tree and watch will memory leak when failure occurs In audit_data_to_entry() when a failure occurs we must check and free the tree and watch to avoid a memory leak. test: plan: test command: "auditctl -a exit,always -w /etc -F auid=-1" (on fedora17, need modify auditctl to let "-w /etc" has effect) running: under fedora17 x86_64, 2 CPUs 3.20GHz, 2.5GB RAM. let 15 auditctl processes continue running at the same time. monitor command: watch -d -n 1 "cat /proc/meminfo | awk '{print \$2}' \ | head -n 4 | xargs \ | awk '{print \"used \",\$1 - \$2 - \$3 - \$4}'" result: for original version: will use up all memory, within 3 hours. kill all auditctl, the memory still does not free. for new version (apply this patch): after 14 hours later, not find issues. Signed-off-by: Chen Gang <gang.chen@asianux.com> Cc: Eric Paris <eparis@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/auditfilter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06f..267436826c3b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -594,6 +594,10 @@ exit_nofree: return entry; exit_free: + if (entry->rule.watch) + audit_put_watch(entry->rule.watch); /* matches initial get */ + if (entry->rule.tree) + audit_put_tree(entry->rule.tree); /* that's the temporary one */ audit_free_rule(entry); return ERR_PTR(err); } -- cgit v1.2.3 From 12b2f117f3bf738c1a00a6f64393f1953a740bd4 Mon Sep 17 00:00:00 2001 From: Chen Gang <gang.chen@asianux.com> Date: Mon, 29 Apr 2013 15:05:19 -0700 Subject: kernel/audit_tree.c: tree will leak memory when failure occurs in audit_trim_trees() audit_trim_trees() calls get_tree(). If a failure occurs we must call put_tree(). [akpm@linux-foundation.org: run put_tree() before mutex_lock() for small scalability improvement] Signed-off-by: Chen Gang <gang.chen@asianux.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Eric Paris <eparis@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/audit_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d6..a291aa23fb3f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void) } spin_unlock(&hash_lock); trim_marked(tree); - put_tree(tree); drop_collected_mounts(root_mnt); skip_it: + put_tree(tree); mutex_lock(&audit_filter_mutex); } list_del(&cursor); -- cgit v1.2.3 From e07cee23e64137f7da2fb35d7b7c0ad26cc0edec Mon Sep 17 00:00:00 2001 From: Jiang Liu <liuj97@gmail.com> Date: Mon, 29 Apr 2013 15:06:58 -0700 Subject: mm,kexec: use common help functions to free reserved pages Use common help functions to free reserved pages. Signed-off-by: Jiang Liu <jiang.liu@huawei.com> Cc: Eric Biederman <ebiederm@xmission.com> Reviewed-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/kexec.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ffd4e111fd67..b19181d44201 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, { unsigned long addr; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; - } + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) -- cgit v1.2.3 From 6d2488f64a240191f0733c1f32d73607916b01b7 Mon Sep 17 00:00:00 2001 From: Michal Hocko <mhocko@suse.cz> Date: Mon, 29 Apr 2013 15:07:21 -0700 Subject: cgroup: remove css_get_next Now that we have generic and well ordered cgroup tree walkers there is no need to keep css_get_next in the place. Signed-off-by: Michal Hocko <mhocko@suse.cz> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizefan@huawei.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Ying Han <yinghan@google.com> Cc: Tejun Heo <htejun@gmail.com> Cc: Glauber Costa <glommer@parallels.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/cgroup.h | 7 ------- kernel/cgroup.c | 49 ------------------------------------------------- 2 files changed, 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f55..470073bf93d0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -687,13 +687,6 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id); -/* - * Get a cgroup whose id is greater than or equal to id under tree of root. - * Returning a cgroup_subsys_state or NULL. - */ -struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid); - /* Returns true if root is ancestor of cg */ bool css_is_ancestor(struct cgroup_subsys_state *cg, const struct cgroup_subsys_state *root); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..dfaf50d4705e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5416,55 +5416,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid) -{ - struct cgroup_subsys_state *ret = NULL; - struct css_id *tmp; - int tmpid; - int rootid = css_id(root); - int depth = css_depth(root); - - if (!rootid) - return NULL; - - BUG_ON(!ss->use_id); - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* fill start point for scan */ - tmpid = id; - while (1) { - /* - * scan next entry from bitmap(tree), tmpid is updated after - * idr_get_next(). - */ - tmp = idr_get_next(&ss->idr, &tmpid); - if (!tmp) - break; - if (tmp->depth >= depth && tmp->stack[depth] == rootid) { - ret = rcu_dereference(tmp->css); - if (ret) { - *foundid = tmpid; - break; - } - } - /* continue to scan from next id */ - tmpid = tmpid + 1; - } - return ret; -} - /* * get corresponding css from file open on cgroupfs directory */ -- cgit v1.2.3 From 146732ce104ddfed3d4d82722c0b336074016b92 Mon Sep 17 00:00:00 2001 From: Josh Triplett <josh@joshtriplett.org> Date: Mon, 29 Apr 2013 15:07:22 -0700 Subject: fs: don't compile in drop_caches.c when CONFIG_SYSCTL=n drop_caches.c provides code only invokable via sysctl, so don't compile it in when CONFIG_SYSCTL=n. Signed-off-by: Josh Triplett <josh@joshtriplett.org> Acked-by: Kees Cook <keescook@chromium.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- fs/Makefile | 3 ++- include/linux/mm.h | 4 ++++ kernel/sysctl.c | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index 9d53192236fc..3b2c76759ec9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o super.o \ ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ - pnode.o drop_caches.o splice.o sync.o utimes.o \ + pnode.o splice.o sync.o utimes.o \ stack.o fs_struct.o statfs.o ifeq ($(CONFIG_BLOCK),y) @@ -49,6 +49,7 @@ obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o xattr_acl.o obj-$(CONFIG_NFS_COMMON) += nfs_common/ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_COREDUMP) += coredump.o +obj-$(CONFIG_SYSCTL) += drop_caches.o obj-$(CONFIG_FHANDLE) += fhandle.o diff --git a/include/linux/mm.h b/include/linux/mm.h index 43b70d5f8201..98a44376e4f3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1731,8 +1731,12 @@ int in_gate_area_no_mm(unsigned long addr); #define in_gate_area(mm, addr) ({(void)mm; in_gate_area_no_mm(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ +#ifdef CONFIG_SYSCTL +extern int sysctl_drop_caches; int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +#endif + unsigned long shrink_slab(struct shrink_control *shrink, unsigned long nr_pages_scanned, unsigned long lru_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f8..3dadde52253c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; #endif extern int pid_max; extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; -- cgit v1.2.3 From f1c4069e1dc128dc8a851174cba2e273652e9216 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim <js1304@gmail.com> Date: Mon, 29 Apr 2013 15:07:37 -0700 Subject: mm, vmalloc: export vmap_area_list, instead of vmlist Although our intention is to unexport internal structure entirely, but there is one exception for kexec. kexec dumps address of vmlist and makedumpfile uses this information. We are about to remove vmlist, then another way to retrieve information of vmalloc layer is needed for makedumpfile. For this purpose, we export vmap_area_list, instead of vmlist. Signed-off-by: Joonsoo Kim <js1304@gmail.com> Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Dave Anderson <anderson@redhat.com> Cc: Vivek Goyal <vgoyal@redhat.com> Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: Ingo Molnar <mingo@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/vmalloc.h | 3 +-- kernel/kexec.c | 2 +- mm/nommu.c | 3 +-- mm/vmalloc.c | 11 ++++++----- 4 files changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 698b1e50d3a4..8a25f9081ed0 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -130,8 +130,7 @@ extern long vwrite(char *buf, char *addr, unsigned long count); /* * Internals. Dont't use.. */ -extern rwlock_t vmlist_lock; -extern struct vm_struct *vmlist; +extern struct list_head vmap_area_list; extern __init void vm_area_add_early(struct vm_struct *vm); extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); diff --git a/kernel/kexec.c b/kernel/kexec.c index b19181d44201..0b1f7e780d46 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1577,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmlist); + VMCOREINFO_SYMBOL(vmap_area_list); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); diff --git a/mm/nommu.c b/mm/nommu.c index e001768b14e8..2f1c75ed468e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -228,8 +228,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL(follow_pfn); -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; +LIST_HEAD(vmap_area_list); void vfree(const void *addr) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bda6cef5b97f..7e63984eb585 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -261,7 +261,8 @@ struct vmap_area { }; static DEFINE_SPINLOCK(vmap_area_lock); -static LIST_HEAD(vmap_area_list); +/* Export for kexec only */ +LIST_HEAD(vmap_area_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -272,6 +273,10 @@ static unsigned long cached_align; static unsigned long vmap_area_pcpu_hole; +/*** Old vmalloc interfaces ***/ +static DEFINE_RWLOCK(vmlist_lock); +static struct vm_struct *vmlist; + static struct vmap_area *__find_vmap_area(unsigned long addr) { struct rb_node *n = vmap_area_root.rb_node; @@ -1283,10 +1288,6 @@ int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) } EXPORT_SYMBOL_GPL(map_vm_area); -/*** Old vmalloc interfaces ***/ -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { -- cgit v1.2.3 From 13ba3fcbbe31068b1ee7c39a0b58ecbed03c4d72 Mon Sep 17 00:00:00 2001 From: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> Date: Mon, 29 Apr 2013 15:07:40 -0700 Subject: kexec, vmalloc: export additional vmalloc layer information Now, vmap_area_list is exported as VMCOREINFO for makedumpfile to get the start address of vmalloc region (vmalloc_start). The address which contains vmalloc_start value is represented as below: vmap_area_list.next - OFFSET(vmap_area.list) + OFFSET(vmap_area.va_start) However, both OFFSET(vmap_area.va_start) and OFFSET(vmap_area.list) aren't exported as VMCOREINFO. So this patch exports them externally with small cleanup. [akpm@linux-foundation.org: vmalloc.h should include list.h for list_head] Signed-off-by: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> Cc: Joonsoo Kim <js1304@gmail.com> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Atsushi Kumagai <kumagai-atsushi@mxc.nes.nec.co.jp> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Dave Anderson <anderson@redhat.com> Cc: Eric Biederman <ebiederm@xmission.com> Cc: Guan Xuetao <gxt@mprc.pku.edu.cn> Cc: Ingo Molnar <mingo@kernel.org> Cc: Vivek Goyal <vgoyal@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/vmalloc.h | 13 +++++++++++++ kernel/kexec.c | 3 ++- mm/vmalloc.c | 11 ----------- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 8a25f9081ed0..7d5773a99f20 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -3,7 +3,9 @@ #include <linux/spinlock.h> #include <linux/init.h> +#include <linux/list.h> #include <asm/page.h> /* pgprot_t */ +#include <linux/rbtree.h> struct vm_area_struct; /* vma defining user mapping in mm_types.h */ @@ -35,6 +37,17 @@ struct vm_struct { const void *caller; }; +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + struct vm_struct *vm; + struct rcu_head rcu_head; +}; + /* * Highlevel APIs for driver use */ diff --git a/kernel/kexec.c b/kernel/kexec.c index 0b1f7e780d46..b574920cbd4b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1615,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vm_struct, addr); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 151da8ac53fa..72043d6c88c0 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -249,17 +249,6 @@ EXPORT_SYMBOL(vmalloc_to_pfn); #define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 -struct vmap_area { - unsigned long va_start; - unsigned long va_end; - unsigned long flags; - struct rb_node rb_node; /* address sorted rbtree */ - struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ - struct vm_struct *vm; - struct rcu_head rcu_head; -}; - static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); -- cgit v1.2.3 From d8f10cb3d375c34ad668f32ca6e4661ad1fc23b2 Mon Sep 17 00:00:00 2001 From: Andrew Morton <akpm@linux-foundation.org> Date: Mon, 29 Apr 2013 15:08:08 -0700 Subject: kernel/cpuset.c: use register_hotmemory_notifier() Use the new interface, remove one ifdef. No code size changes. We could/should have been using __meminit/__meminitdata here but there's now no point in doing that because all this code is elided at compile time. Cc: Li Zefan <lizefan@huawei.com> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/cpuset.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..334d983a36b2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2251,7 +2251,6 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } -#ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2263,20 +2262,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self, schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { + .notifier_call = cpuset_track_online_nodes, + .priority = 10, /* ??! */ +}; /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); cpuset_propagate_hotplug_wq = alloc_ordered_workqueue("cpuset_hotplug", 0); -- cgit v1.2.3 From c9b1d0981fcce3d9976d7b7a56e4e0503bc610dd Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker <agshew@gmail.com> Date: Mon, 29 Apr 2013 15:08:10 -0700 Subject: mm: limit growth of 3% hardcoded other user reserve Add user_reserve_kbytes knob. Limit the growth of the memory reserved for other user processes to min(3% current process size, user_reserve_pages). Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. user_reserve_pages defaults to min(3% free pages, 128MB) I arrived at 128MB by taking the max VSZ of sshd, login, bash, and top ... then adding the RSS of each. This only affects OVERCOMMIT_NEVER mode. Background 1. user reserve __vm_enough_memory reserves a hardcoded 3% of the current process size for other applications when overcommit is disabled. This was done so that a user could recover if they launched a memory hogging process. Without the reserve, a user would easily run into a message such as: bash: fork: Cannot allocate memory 2. admin reserve Additionally, a hardcoded 3% of free memory is reserved for root in both overcommit 'guess' and 'never' modes. This was intended to prevent a scenario where root-cant-log-in and perform recovery operations. Note that this reserve shrinks, and doesn't guarantee a useful reserve. Motivation The two hardcoded memory reserves should be updated to account for current memory sizes. Also, the admin reserve would be more useful if it didn't shrink too much. When the current code was originally written, 1GB was considered "enterprise". Now the 3% reserve can grow to multiple GB on large memory systems, and it only needs to be a few hundred MB at most to enable a user or admin to recover a system with an unwanted memory hogging process. I've found that reducing these reserves is especially beneficial for a specific type of application load: * single application system * one or few processes (e.g. one per core) * allocating all available memory * not initializing every page immediately * long running I've run scientific clusters with this sort of load. A long running job sometimes failed many hours (weeks of CPU time) into a calculation. They weren't initializing all of their memory immediately, and they weren't using calloc, so I put systems into overcommit 'never' mode. These clusters run diskless and have no swap. However, with the current reserves, a user wishing to allocate as much memory as possible to one process may be prevented from using, for example, almost 2GB out of 32GB. The effect is less, but still significant when a user starts a job with one process per core. I have repeatedly seen a set of processes requesting the same amount of memory fail because one of them could not allocate the amount of memory a user would expect to be able to allocate. For example, Message Passing Interfce (MPI) processes, one per core. And it is similar for other parallel programming frameworks. Changing this reserve code will make the overcommit never mode more useful by allowing applications to allocate nearly all of the available memory. Also, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory. Risks * "bash: fork: Cannot allocate memory" The downside of the first patch-- which creates a tunable user reserve that is only used in overcommit 'never' mode--is that an admin can set it so low that a user may not be able to kill their process, even if they already have a shell prompt. Of course, a user can get in the same predicament with the current 3% reserve--they just have to launch processes until 3% becomes negligible. * root-cant-log-in problem The second patch, adding the tunable rootuser_reserve_pages, allows the admin to shoot themselves in the foot by setting it too small. They can easily get the system into a state where root-can't-log-in. However, the new admin_reserve_kbytes will be safer than the current behavior since the hardcoded 3% of available memory reserve can shrink to something useless in the case where applications have grabbed all available memory. Alternatives * Memory cgroups provide a more flexible way to limit application memory. Not everyone wants to set up cgroups or deal with their overhead. * We could create a fourth overcommit mode which provides smaller reserves. The size of useful reserves may be drastically different depending on the whether the system is embedded or enterprise. * Force users to initialize all of their memory or use calloc. Some users don't want/expect the system to overcommit when they malloc. Overcommit 'never' mode is for this scenario, and it should work well. The new user and admin reserve tunables are simple to use, with low overhead compared to cgroups. The patches preserve current behavior where 3% of memory is less than 128MB, except that the admin reserve doesn't shrink to an unusable size under pressure. The code allows admins to tune for embedded and enterprise usage. FAQ * How is the root-cant-login problem addressed? What happens if admin_reserve_pages is set to 0? Root is free to shoot themselves in the foot by setting admin_reserve_kbytes too low. On x86_64, the minimum useful reserve is: 8MB for overcommit 'guess' 128MB for overcommit 'never' admin_reserve_pages defaults to min(3% free memory, 8MB) So, anyone switching to 'never' mode needs to adjust admin_reserve_pages. * How do you calculate a minimum useful reserve? A user or the admin needs enough memory to login and perform recovery operations, which includes, at a minimum: sshd or login + bash (or some other shell) + top (or ps, kill, etc.) For overcommit 'guess', we can sum resident set sizes (RSS) because we only need enough memory to handle what the recovery programs will typically use. On x86_64 this is about 8MB. For overcommit 'never', we can take the max of their virtual sizes (VSZ) and add the sum of their RSS. We use VSZ instead of RSS because mode forces us to ensure we can fulfill all of the requested memory allocations-- even if the programs only use a fraction of what they ask for. On x86_64 this is about 128MB. When swap is enabled, reserves are useful even when they are as small as 10MB, regardless of overcommit mode. When both swap and overcommit are disabled, then the admin should tune the reserves higher to be absolutley safe. Over 230MB each was safest in my testing. * What happens if user_reserve_pages is set to 0? Note, this only affects overcomitt 'never' mode. Then a user will be able to allocate all available memory minus admin_reserve_kbytes. However, they will easily see a message such as: "bash: fork: Cannot allocate memory" And they won't be able to recover/kill their application. The admin should be able to recover the system if admin_reserve_kbytes is set appropriately. * What's the difference between overcommit 'guess' and 'never'? "Guess" allows an allocation if there are enough free + reclaimable pages. It has a hardcoded 3% of free pages reserved for root. "Never" allows an allocation if there is enough swap + a configurable percentage (default is 50) of physical RAM. It has a hardcoded 3% of free pages reserved for root, like "Guess" mode. It also has a hardcoded 3% of the current process size reserved for additional applications. * Why is overcommit 'guess' not suitable even when an app eventually writes to every page? It takes free pages, file pages, available swap pages, reclaimable slab pages into consideration. In other words, these are all pages available, then why isn't overcommit suitable? Because it only looks at the present state of the system. It does not take into account the memory that other applications have malloced, but haven't initialized yet. It overcommits the system. Test Summary There was little change in behavior in the default overcommit 'guess' mode with swap enabled before and after the patch. This was expected. Systems run most predictably (i.e. no oom kills) in overcommit 'never' mode with swap enabled. This also allowed the most memory to be allocated to a user application. Overcommit 'guess' mode without swap is a bad idea. It is easy to crash the system. None of the other tested combinations crashed. This matches my experience on the Roadrunner supercomputer. Without the tunable user reserve, a system in overcommit 'never' mode and without swap does not allow the admin to recover, although the admin can. With the new tunable reserves, a system in overcommit 'never' mode and without swap can be configured to: 1. maximize user-allocatable memory, running close to the edge of recoverability 2. maximize recoverability, sacrificing allocatable memory to ensure that a user cannot take down a system Test Description Fedora 18 VM - 4 x86_64 cores, 5725MB RAM, 4GB Swap System is booted into multiuser console mode, with unnecessary services turned off. Caches were dropped before each test. Hogs are user memtester processes that attempt to allocate all free memory as reported by /proc/meminfo In overcommit 'never' mode, memory_ratio=100 Test Results 3.9.0-rc1-mm1 Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery ---------- ---- ---- ------------- ---- ------------- -------------- guess yes 1 5432/5432 no yes yes guess yes 4 5444/5444 1 yes yes guess no 1 5302/5449 no yes yes guess no 4 - crash no no never yes 1 5460/5460 1 yes yes never yes 4 5460/5460 1 yes yes never no 1 5218/5432 no no yes never no 4 5203/5448 no no yes 3.9.0-rc1-mm1-tunablereserves User and Admin Recovery show their respective reserves, if applicable. Overcommit | Swap | Hogs | MB Got/Wanted | OOMs | User Recovery | Admin Recovery ---------- ---- ---- ------------- ---- ------------- -------------- guess yes 1 5419/5419 no - yes 8MB yes guess yes 4 5436/5436 1 - yes 8MB yes guess no 1 5440/5440 * - yes 8MB yes guess no 4 - crash - no 8MB no * process would successfully mlock, then the oom killer would pick it never yes 1 5446/5446 no 10MB yes 20MB yes never yes 4 5456/5456 no 10MB yes 20MB yes never no 1 5387/5429 no 128MB no 8MB barely never no 1 5323/5428 no 226MB barely 8MB barely never no 1 5323/5428 no 226MB barely 8MB barely never no 1 5359/5448 no 10MB no 10MB barely never no 1 5323/5428 no 0MB no 10MB barely never no 1 5332/5428 no 0MB no 50MB yes never no 1 5293/5429 no 0MB no 90MB yes never no 1 5001/5427 no 230MB yes 338MB yes never no 4* 4998/5424 no 230MB yes 338MB yes * more memtesters were launched, able to allocate approximately another 100MB Future Work - Test larger memory systems. - Test an embedded image. - Test other architectures. - Time malloc microbenchmarks. - Would it be useful to be able to set overcommit policy for each memory cgroup? - Some lines are slightly above 80 chars. Perhaps define a macro to convert between pages and kb? Other places in the kernel do this. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_user_reserve() static] Signed-off-by: Andrew Shewmaker <agshew@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/sysctl/vm.txt | 20 +++++++++++++++++++ Documentation/vm/overcommit-accounting | 8 +++++++- include/linux/mm.h | 2 ++ kernel/sysctl.c | 7 +++++++ mm/mmap.c | 35 +++++++++++++++++++++++++++++----- mm/nommu.c | 35 +++++++++++++++++++++++++++++----- 6 files changed, 96 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 078701fdbd4d..f69895738357 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -53,6 +53,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - swappiness +- user_reserve_kbytes - vfs_cache_pressure - zone_reclaim_mode @@ -542,6 +543,7 @@ memory until it actually runs out. When this flag is 2, the kernel uses a "never overcommit" policy that attempts to prevent any overcommit of memory. +Note that user_reserve_kbytes affects this policy. This feature can be very useful because there are a lot of programs that malloc() huge amounts of memory "just-in-case" @@ -645,6 +647,24 @@ The default value is 60. ============================================================== +- user_reserve_kbytes + +When overcommit_memory is set to 2, "never overommit" mode, reserve +min(3% of current process size, user_reserve_kbytes) of free memory. +This is intended to prevent a user from starting a single memory hogging +process, such that they cannot recover (kill the hog). + +user_reserve_kbytes defaults to min(3% of the current process size, 128MB). + +If this is reduced to zero, then the user will be allowed to allocate +all free memory with a single process, minus admin_reserve_kbytes. +Any subsequent attempts to execute a command will result in +"fork: Cannot allocate memory". + +Changing this takes effect whenever an application requests memory. + +============================================================== + vfs_cache_pressure ------------------ diff --git a/Documentation/vm/overcommit-accounting b/Documentation/vm/overcommit-accounting index 706d7ed9d8d2..8eaa2fc4b8fa 100644 --- a/Documentation/vm/overcommit-accounting +++ b/Documentation/vm/overcommit-accounting @@ -8,7 +8,9 @@ The Linux kernel supports the following overcommit handling modes default. 1 - Always overcommit. Appropriate for some scientific - applications. + applications. Classic example is code using sparse arrays + and just relying on the virtual memory consisting almost + entirely of zero pages. 2 - Don't overcommit. The total address space commit for the system is not permitted to exceed swap + a @@ -18,6 +20,10 @@ The Linux kernel supports the following overcommit handling modes pages but will receive errors on memory allocation as appropriate. + Useful for applications that want to guarantee their + memory allocations will be available in the future + without having to initialize every page. + The overcommit policy is set via the sysctl `vm.overcommit_memory'. The overcommit percentage is set via `vm.overcommit_ratio'. diff --git a/include/linux/mm.h b/include/linux/mm.h index 7aa11a6736eb..43cfaabbde40 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -44,6 +44,8 @@ extern int sysctl_legacy_va_layout; #include <asm/pgtable.h> #include <asm/processor.h> +extern unsigned long sysctl_user_reserve_kbytes; + #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3dadde52253c..6daabb72bdb5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1429,6 +1429,13 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/mm/mmap.c b/mm/mmap.c index 081e6da8e1a4..80a965f35251 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -84,6 +84,7 @@ EXPORT_SYMBOL(vm_get_page_prot); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -122,7 +123,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -183,10 +184,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -3094,3 +3098,24 @@ void __init mmap_init(void) ret = percpu_counter_init(&vm_committed_as, 0); VM_BUG_ON(ret); } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 2f1c75ed468e..58e4a0a5125f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -63,6 +63,7 @@ int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; +unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1897,7 +1898,7 @@ EXPORT_SYMBOL(unmap_mapping_range); */ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { - unsigned long free, allowed; + unsigned long free, allowed, reserve; vm_acct_memory(pages); @@ -1957,10 +1958,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed -= allowed / 32; allowed += total_swap_pages; - /* Don't let a single process grow too big: - leave 3% of the size of this process for other processes */ - if (mm) - allowed -= mm->total_vm / 32; + /* + * Don't let a single process grow so big a user can't recover + */ + if (mm) { + reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed -= min(mm->total_vm / 32, reserve); + } if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; @@ -2122,3 +2126,24 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size, up_write(&nommu_region_sem); return 0; } + +/* + * Initialise sysctl_user_reserve_kbytes. + * + * This is intended to prevent a user from starting a single memory hogging + * process, such that they cannot recover (kill the hog) in OVERCOMMIT_NEVER + * mode. + * + * The default value is min(3% of free memory, 128MB) + * 128MB is enough to recover with sshd/login, bash, and top/kill. + */ +static int __meminit init_user_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); + return 0; +} +module_init(init_user_reserve) -- cgit v1.2.3 From 4eeab4f5580d11bffedc697684b91b0bca0d5009 Mon Sep 17 00:00:00 2001 From: Andrew Shewmaker <agshew@gmail.com> Date: Mon, 29 Apr 2013 15:08:11 -0700 Subject: mm: replace hardcoded 3% with admin_reserve_pages knob Add an admin_reserve_kbytes knob to allow admins to change the hardcoded memory reserve to something other than 3%, which may be multiple gigabytes on large memory systems. Only about 8MB is necessary to enable recovery in the default mode, and only a few hundred MB are required even when overcommit is disabled. This affects OVERCOMMIT_GUESS and OVERCOMMIT_NEVER. admin_reserve_kbytes is initialized to min(3% free pages, 8MB) I arrived at 8MB by summing the RSS of sshd or login, bash, and top. Please see first patch in this series for full background, motivation, testing, and full changelog. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: make init_admin_reserve() static] Signed-off-by: Andrew Shewmaker <agshew@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- Documentation/sysctl/vm.txt | 30 ++++++++++++++++++++++++++++++ include/linux/mm.h | 1 + kernel/sysctl.c | 7 +++++++ mm/mmap.c | 30 ++++++++++++++++++++++++++---- mm/nommu.c | 30 ++++++++++++++++++++++++++---- 5 files changed, 90 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index f69895738357..dcc75a9ed919 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -18,6 +18,7 @@ files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: +- admin_reserve_kbytes - block_dump - compact_memory - dirty_background_bytes @@ -59,6 +60,35 @@ Currently, these files are in /proc/sys/vm: ============================================================== +admin_reserve_kbytes + +The amount of free memory in the system that should be reserved for users +with the capability cap_sys_admin. + +admin_reserve_kbytes defaults to min(3% of free pages, 8MB) + +That should provide enough for the admin to log in and kill a process, +if necessary, under the default overcommit 'guess' mode. + +Systems running under overcommit 'never' should increase this to account +for the full Virtual Memory Size of programs used to recover. Otherwise, +root may not be able to log in to recover the system. + +How do you calculate a minimum useful reserve? + +sshd or login + bash (or some other shell) + top (or ps, kill, etc.) + +For overcommit 'guess', we can sum resident set sizes (RSS). +On x86_64 this is about 8MB. + +For overcommit 'never', we can take the max of their virtual sizes (VSZ) +and add the sum of their RSS. +On x86_64 this is about 128MB. + +Changing this takes effect whenever an application requests memory. + +============================================================== + block_dump block_dump enables block I/O debugging when set to a nonzero value. More diff --git a/include/linux/mm.h b/include/linux/mm.h index 43cfaabbde40..c05d7cfbb6b9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -45,6 +45,7 @@ extern int sysctl_legacy_va_layout; #include <asm/processor.h> extern unsigned long sysctl_user_reserve_kbytes; +extern unsigned long sysctl_admin_reserve_kbytes; #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6daabb72bdb5..9edcf456e0fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1436,6 +1436,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/mm/mmap.c b/mm/mmap.c index 80a965f35251..5485f18e6631 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -85,6 +85,7 @@ int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic ove int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ /* * Make sure vm_committed_as in one cacheline and not cacheline shared with * other variables. It can be updated by several CPUs frequently. @@ -164,10 +165,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -178,10 +179,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = (totalram_pages - hugetlb_total_pages()) * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -3119,3 +3120,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) diff --git a/mm/nommu.c b/mm/nommu.c index 58e4a0a5125f..fbe3e2f317eb 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -64,6 +64,7 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ +unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; @@ -1939,10 +1940,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free -= totalreserve_pages; /* - * Leave the last 3% for root + * Reserve some for root */ if (!cap_sys_admin) - free -= free / 32; + free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); if (free > pages) return 0; @@ -1952,10 +1953,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) allowed = totalram_pages * sysctl_overcommit_ratio / 100; /* - * Leave the last 3% for root + * Reserve some 3% for root */ if (!cap_sys_admin) - allowed -= allowed / 32; + allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); allowed += total_swap_pages; /* @@ -2147,3 +2148,24 @@ static int __meminit init_user_reserve(void) return 0; } module_init(init_user_reserve) + +/* + * Initialise sysctl_admin_reserve_kbytes. + * + * The purpose of sysctl_admin_reserve_kbytes is to allow the sys admin + * to log in and kill a memory hogging process. + * + * Systems with more than 256MB will reserve 8MB, enough to recover + * with sshd, bash, and top in OVERCOMMIT_GUESS. Smaller systems will + * only reserve 3% of free pages by default. + */ +static int __meminit init_admin_reserve(void) +{ + unsigned long free_kbytes; + + free_kbytes = global_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + + sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); + return 0; +} +module_init(init_admin_reserve) -- cgit v1.2.3 From ae8e3a915aef5af5ace5936c56f05f0b1502ded1 Mon Sep 17 00:00:00 2001 From: Toshi Kani <toshi.kani@hp.com> Date: Mon, 29 Apr 2013 15:08:17 -0700 Subject: resource: add __adjust_resource() for internal use Add __adjust_resource(), which is called by adjust_resource() internally after the resource_lock is held. There is no interface change to adjust_resource(). This change allows other functions to call __adjust_resource() internally while the resource_lock is held. Signed-off-by: Toshi Kani <toshi.kani@hp.com> Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Acked-by: David Rientjes <rientjes@google.com> Cc: Ram Pai <linuxram@us.ibm.com> Cc: T Makphaibulchoke <tmac@hp.com> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Jiang Liu <jiang.liu@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/resource.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b9..ae246f97c5d3 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -706,24 +706,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; - write_lock(&resource_lock); - if (!parent) goto skip; @@ -751,6 +740,26 @@ skip: result = 0; out: + return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) +{ + int result; + + write_lock(&resource_lock); + result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } -- cgit v1.2.3 From 825f787bb49676083b97c1de1f8f2f8f26b5c908 Mon Sep 17 00:00:00 2001 From: Toshi Kani <toshi.kani@hp.com> Date: Mon, 29 Apr 2013 15:08:19 -0700 Subject: resource: add release_mem_region_adjustable() Add release_mem_region_adjustable(), which releases a requested region from a currently busy memory resource. This interface adjusts the matched memory resource accordingly even if the requested region does not match exactly but still fits into. This new interface is intended for memory hot-delete. During bootup, memory resources are inserted from the boot descriptor table, such as EFI Memory Table and e820. Each memory resource entry usually covers the whole contigous memory range. Memory hot-delete request, on the other hand, may target to a particular range of memory resource, and its size can be much smaller than the whole contiguous memory. Since the existing release interfaces like __release_region() require a requested region to be exactly matched to a resource entry, they do not allow a partial resource to be released. This new interface is restrictive (i.e. release under certain conditions), which is consistent with other release interfaces, __release_region() and __release_resource(). Additional release conditions, such as an overlapping region to a resource entry, can be supported after they are confirmed as valid cases. There is no change to the existing interfaces since their restriction is valid for I/O resources. [akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()] [akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily] [akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi] Signed-off-by: Toshi Kani <toshi.kani@hp.com> Reviewed-by : Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Reviewed-by: Ram Pai <linuxram@us.ibm.com> Cc: T Makphaibulchoke <tmac@hp.com> Cc: Wen Congyang <wency@cn.fujitsu.com> Cc: Tang Chen <tangchen@cn.fujitsu.com> Cc: Jiang Liu <jiang.liu@huawei.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/linux/ioport.h | 4 ++ kernel/resource.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'kernel') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 85ac9b9b72a2..89b7c24a36e9 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -192,6 +192,10 @@ extern struct resource * __request_region(struct resource *, extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern int release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); +#endif static inline int __deprecated check_region(resource_size_t s, resource_size_t n) diff --git a/kernel/resource.c b/kernel/resource.c index ae246f97c5d3..4aef8867fd4b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1021,6 +1021,109 @@ void __release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete. The requested region + * is released from a currently busy memory resource. The requested region + * must either match exactly or fit into a single busy resource entry. In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + * supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + * assumes that all children remain in the lower address entry for + * simplicity. Enhance this logic when necessary. + */ +int release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) +{ + struct resource **p; + struct resource *res; + struct resource *new_res; + resource_size_t end; + int ret = -EINVAL; + + end = start + size - 1; + if ((start < parent->start) || (end > parent->end)) + return ret; + + /* The kzalloc() result gets checked later */ + new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + + p = &parent->child; + write_lock(&resource_lock); + + while ((res = *p)) { + if (res->start >= end) + break; + + /* look for the next resource if it does not fit into */ + if (res->start > start || res->end < end) { + p = &res->sibling; + continue; + } + + if (!(res->flags & IORESOURCE_MEM)) + break; + + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + + /* found the target resource; let's adjust accordingly */ + if (res->start == start && res->end == end) { + /* free the whole entry */ + *p = res->sibling; + kfree(res); + ret = 0; + } else if (res->start == start && res->end != end) { + /* adjust the start */ + ret = __adjust_resource(res, end + 1, + res->end - end); + } else if (res->start != start && res->end == end) { + /* adjust the end */ + ret = __adjust_resource(res, res->start, + start - res->start); + } else { + /* split into two entries */ + if (!new_res) { + ret = -ENOMEM; + break; + } + new_res->name = res->name; + new_res->start = end + 1; + new_res->end = res->end; + new_res->flags = res->flags; + new_res->parent = res->parent; + new_res->sibling = res->sibling; + new_res->child = NULL; + + ret = __adjust_resource(res, res->start, + start - res->start); + if (ret) + break; + res->sibling = new_res; + new_res = NULL; + } + + break; + } + + write_unlock(&resource_lock); + kfree(new_res); + return ret; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* * Managed region resource */ -- cgit v1.2.3 From ebff7d8f270d045338d9f4796014f4db429a17f9 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Date: Mon, 29 Apr 2013 15:08:56 -0700 Subject: mem hotunplug: fix kfree() of bootmem memory When hot removing memory presented at boot time, following messages are shown: kernel BUG at mm/slub.c:3409! invalid opcode: 0000 [#1] SMP Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod CPU 0 Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15 RIP: kfree+0x232/0x240 Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80) Call Trace: __release_region+0xd4/0xe0 __remove_pages+0x52/0x110 arch_remove_memory+0x89/0xd0 remove_memory+0xc4/0x100 acpi_memory_device_remove+0x6d/0xb1 acpi_device_remove+0x89/0xab __device_release_driver+0x7c/0xf0 device_release_driver+0x2f/0x50 acpi_bus_device_detach+0x6c/0x70 acpi_ns_walk_namespace+0x11a/0x250 acpi_walk_namespace+0xee/0x137 acpi_bus_trim+0x33/0x7a acpi_bus_hot_remove_device+0xc4/0x1a1 acpi_os_execute_deferred+0x27/0x34 process_one_work+0x1f7/0x590 worker_thread+0x11a/0x370 kthread+0xee/0x100 ret_from_fork+0x7c/0xb0 RIP [<ffffffff811c41d2>] kfree+0x232/0x240 RSP <ffff88084678d968> The reason why the messages are shown is to release a resource structure, allocated by bootmem, by kfree(). So when we release a resource structure, we should check whether it is allocated by bootmem or not. But even if we know a resource structure is allocated by bootmem, we cannot release it since SLxB cannot treat it. So for reusing a resource structure, this patch remembers it by using bootmem_resource as follows: When releasing a resource structure by free_resource(), free_resource() checks whether the resource structure is allocated by bootmem or not. If it is allocated by bootmem, free_resource() adds it to bootmem_resource. If it is not allocated by bootmem, free_resource() release it by kfree(). And when getting a new resource structure by get_resource(), get_resource() checks whether bootmem_resource has released resource structures or not. If there is a released resource structure, get_resource() returns it. If there is not a releaed resource structure, get_resource() returns new resource structure allocated by kzalloc(). [akpm@linux-foundation.org: s/get_resource/alloc_resource/] Signed-off-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com> Reviewed-by: Toshi Kani <toshi.kani@hp.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Ram Pai <linuxram@us.ibm.com> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/resource.c | 68 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 4aef8867fd4b..d7386986e10e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@ #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> +#include <linux/mm.h> #include <asm/io.h> @@ -50,6 +51,14 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. + */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ + if (!res) + return; + + if (!PageSlab(virt_to_head_page(res))) { + spin_lock(&bootmem_resource_lock); + res->sibling = bootmem_resource_free; + bootmem_resource_free = res; + spin_unlock(&bootmem_resource_lock); + } else { + kfree(res); + } +} + +static struct resource *alloc_resource(gfp_t flags) +{ + struct resource *res = NULL; + + spin_lock(&bootmem_resource_lock); + if (bootmem_resource_free) { + res = bootmem_resource_free; + bootmem_resource_free = res->sibling; + } + spin_unlock(&bootmem_resource_lock); + + if (res) + memset(res, 0, sizeof(struct resource)); + else + res = kzalloc(sizeof(struct resource), flags); + + return res; +} + /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { @@ -771,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; if (!res) @@ -796,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { - kfree(res); + free_resource(res); WARN_ON(next_res); break; } @@ -806,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { - next_res = kzalloc(sizeof(*next_res), - GFP_ATOMIC); + next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { - kfree(res); + free_resource(res); break; } next_res->name = name; @@ -899,7 +941,7 @@ struct resource * __request_region(struct resource *parent, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = alloc_resource(GFP_KERNEL); if (!res) return NULL; @@ -933,7 +975,7 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - kfree(res); + free_resource(res); res = NULL; break; } @@ -967,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, return -EBUSY; release_resource(res); - kfree(res); + free_resource(res); return 0; } EXPORT_SYMBOL(__check_region); @@ -1007,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); - kfree(res); + free_resource(res); return; } p = &res->sibling; @@ -1055,8 +1097,8 @@ int release_mem_region_adjustable(struct resource *parent, if ((start < parent->start) || (end > parent->end)) return ret; - /* The kzalloc() result gets checked later */ - new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + /* The alloc_resource() result gets checked later */ + new_res = alloc_resource(GFP_KERNEL); p = &parent->child; write_lock(&resource_lock); @@ -1083,7 +1125,7 @@ int release_mem_region_adjustable(struct resource *parent, if (res->start == start && res->end == end) { /* free the whole entry */ *p = res->sibling; - kfree(res); + free_resource(res); ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ @@ -1119,7 +1161,7 @@ int release_mem_region_adjustable(struct resource *parent, } write_unlock(&resource_lock); - kfree(new_res); + free_resource(new_res); return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From 07c65f4d1aa74f7cf1c46d7f96e05cfa3e628ba1 Mon Sep 17 00:00:00 2001 From: "zhangwei(Jovi)" <jovi.zhangwei@huawei.com> Date: Mon, 29 Apr 2013 16:17:16 -0700 Subject: printk/tracing: rework console tracing Commit 7ff9554bb578 ("printk: convert byte-buffer to variable-length record buffer") removed start and end parameters from call_console_drivers, but those parameters still exist in include/trace/events/printk.h. Without start and end parameters handling, printk tracing became more simple as: trace_console(text, len); Signed-off-by: zhangwei(Jovi) <jovi.zhangwei@huawei.com> Acked-by: Steven Rostedt <rostedt@goodmis.org> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Kay Sievers <kay@vrfy.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- include/trace/events/printk.h | 25 ++++++------------------- kernel/printk.c | 2 +- 2 files changed, 7 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/printk.h b/include/trace/events/printk.h index 94ec79cc011a..c008bc99f9fa 100644 --- a/include/trace/events/printk.h +++ b/include/trace/events/printk.h @@ -6,31 +6,18 @@ #include <linux/tracepoint.h> -TRACE_EVENT_CONDITION(console, - TP_PROTO(const char *log_buf, unsigned start, unsigned end, - unsigned log_buf_len), +TRACE_EVENT(console, + TP_PROTO(const char *text, size_t len), - TP_ARGS(log_buf, start, end, log_buf_len), - - TP_CONDITION(start != end), + TP_ARGS(text, len), TP_STRUCT__entry( - __dynamic_array(char, msg, end - start + 1) + __dynamic_array(char, msg, len + 1) ), TP_fast_assign( - if ((start & (log_buf_len - 1)) > (end & (log_buf_len - 1))) { - memcpy(__get_dynamic_array(msg), - log_buf + (start & (log_buf_len - 1)), - log_buf_len - (start & (log_buf_len - 1))); - memcpy((char *)__get_dynamic_array(msg) + - log_buf_len - (start & (log_buf_len - 1)), - log_buf, end & (log_buf_len - 1)); - } else - memcpy(__get_dynamic_array(msg), - log_buf + (start & (log_buf_len - 1)), - end - start); - ((char *)__get_dynamic_array(msg))[end - start] = 0; + memcpy(__get_dynamic_array(msg), text, len); + ((char *)__get_dynamic_array(msg))[len] = 0; ), TP_printk("%s", __get_str(msg)) diff --git a/kernel/printk.c b/kernel/printk.c index abbdd9e2ac82..2de593df036b 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1265,7 +1265,7 @@ static void call_console_drivers(int level, const char *text, size_t len) { struct console *con; - trace_console(text, 0, len, len); + trace_console(text, len); if (level >= console_loglevel && !ignore_loglevel) return; -- cgit v1.2.3 From d0380e6c3c0f6edb986d8798a23acfaf33d5df23 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner <tglx@linutronix.de> Date: Mon, 29 Apr 2013 16:17:18 -0700 Subject: early_printk: consolidate random copies of identical code The early console implementations are the same all over the place. Move the print function to kernel/printk and get rid of the copies. [akpm@linux-foundation.org: arch/mips/kernel/early_printk.c needs kernel.h for va_list] [paul.gortmaker@windriver.com: sh4: make the bios early console support depend on EARLY_PRINTK] Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Russell King <linux@arm.linux.org.uk> Acked-by: Mike Frysinger <vapier@gentoo.org> Cc: Michal Simek <monstr@monstr.eu> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Paul Mundt <lethal@linux-sh.org> Cc: "David S. Miller" <davem@davemloft.net> Cc: Chris Metcalf <cmetcalf@tilera.com> Cc: Richard Weinberger <richard@nod.at> Reviewed-by: Ingo Molnar <mingo@kernel.org> Tested-by: Paul Gortmaker <paul.gortmaker@windriver.com> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- arch/arm/kernel/early_printk.c | 17 +++-------------- arch/blackfin/kernel/early_printk.c | 2 -- arch/microblaze/kernel/early_printk.c | 26 ++++---------------------- arch/mips/kernel/early_printk.c | 12 ++++++------ arch/powerpc/kernel/udbg.c | 6 ++---- arch/sh/kernel/sh_bios.c | 4 ++-- arch/tile/kernel/early_printk.c | 27 +++++---------------------- arch/um/kernel/early_printk.c | 8 +++++--- arch/unicore32/kernel/early_printk.c | 12 ++++-------- arch/x86/kernel/early_printk.c | 21 ++------------------- include/linux/console.h | 1 + include/linux/printk.h | 6 ++++++ kernel/printk.c | 30 +++++++++++++++++++++++------- 13 files changed, 63 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/early_printk.c b/arch/arm/kernel/early_printk.c index 85aa2b292692..43076536965c 100644 --- a/arch/arm/kernel/early_printk.c +++ b/arch/arm/kernel/early_printk.c @@ -29,28 +29,17 @@ static void early_console_write(struct console *con, const char *s, unsigned n) early_write(s, n); } -static struct console early_console = { +static struct console early_console_dev = { .name = "earlycon", .write = early_console_write, .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1, }; -asmlinkage void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - - va_start(ap, fmt); - n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_write(buf, n); - va_end(ap); -} - static int __init setup_early_printk(char *buf) { - register_console(&early_console); + early_console = &early_console_dev; + register_console(&early_console_dev); return 0; } diff --git a/arch/blackfin/kernel/early_printk.c b/arch/blackfin/kernel/early_printk.c index 84ed8375113c..61fbd2de993d 100644 --- a/arch/blackfin/kernel/early_printk.c +++ b/arch/blackfin/kernel/early_printk.c @@ -25,8 +25,6 @@ extern struct console *bfin_earlyserial_init(unsigned int port, extern struct console *bfin_jc_early_init(void); #endif -static struct console *early_console; - /* Default console */ #define DEFAULT_PORT 0 #define DEFAULT_CFLAG CS8|B57600 diff --git a/arch/microblaze/kernel/early_printk.c b/arch/microblaze/kernel/early_printk.c index 60dcacc68038..365f2d53f1b2 100644 --- a/arch/microblaze/kernel/early_printk.c +++ b/arch/microblaze/kernel/early_printk.c @@ -21,7 +21,6 @@ #include <asm/setup.h> #include <asm/prom.h> -static u32 early_console_initialized; static u32 base_addr; #ifdef CONFIG_SERIAL_UARTLITE_CONSOLE @@ -109,27 +108,11 @@ static struct console early_serial_uart16550_console = { }; #endif /* CONFIG_SERIAL_8250_CONSOLE */ -static struct console *early_console; - -void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - - if (early_console_initialized) { - va_start(ap, fmt); - n = vscnprintf(buf, 512, fmt, ap); - early_console->write(early_console, buf, n); - va_end(ap); - } -} - int __init setup_early_printk(char *opt) { int version = 0; - if (early_console_initialized) + if (early_console) return 1; base_addr = of_early_console(&version); @@ -159,7 +142,6 @@ int __init setup_early_printk(char *opt) } register_console(early_console); - early_console_initialized = 1; return 0; } return 1; @@ -169,7 +151,7 @@ int __init setup_early_printk(char *opt) * only for early console because of performance degression */ void __init remap_early_printk(void) { - if (!early_console_initialized || !early_console) + if (!early_console) return; pr_info("early_printk_console remapping from 0x%x to ", base_addr); base_addr = (u32) ioremap(base_addr, PAGE_SIZE); @@ -194,9 +176,9 @@ void __init remap_early_printk(void) void __init disable_early_printk(void) { - if (!early_console_initialized || !early_console) + if (!early_console) return; pr_warn("disabling early console\n"); unregister_console(early_console); - early_console_initialized = 0; + early_console = NULL; } diff --git a/arch/mips/kernel/early_printk.c b/arch/mips/kernel/early_printk.c index 9e6440eaa455..505cb77d1280 100644 --- a/arch/mips/kernel/early_printk.c +++ b/arch/mips/kernel/early_printk.c @@ -7,7 +7,9 @@ * Copyright (C) 2007 MIPS Technologies, Inc. * written by Ralf Baechle (ralf@linux-mips.org) */ +#include <linux/kernel.h> #include <linux/console.h> +#include <linux/printk.h> #include <linux/init.h> #include <asm/setup.h> @@ -24,20 +26,18 @@ static void early_console_write(struct console *con, const char *s, unsigned n) } } -static struct console early_console = { +static struct console early_console_prom = { .name = "early", .write = early_console_write, .flags = CON_PRINTBUFFER | CON_BOOT, .index = -1 }; -static int early_console_initialized __initdata; - void __init setup_early_printk(void) { - if (early_console_initialized) + if (early_console) return; - early_console_initialized = 1; + early_console = &early_console_prom; - register_console(&early_console); + register_console(&early_console_prom); } diff --git a/arch/powerpc/kernel/udbg.c b/arch/powerpc/kernel/udbg.c index f9748498fe58..13b867093499 100644 --- a/arch/powerpc/kernel/udbg.c +++ b/arch/powerpc/kernel/udbg.c @@ -156,15 +156,13 @@ static struct console udbg_console = { .index = 0, }; -static int early_console_initialized; - /* * Called by setup_system after ppc_md->probe and ppc_md->early_init. * Call it again after setting udbg_putc in ppc_md->setup_arch. */ void __init register_early_udbg_console(void) { - if (early_console_initialized) + if (early_console) return; if (!udbg_putc) @@ -174,7 +172,7 @@ void __init register_early_udbg_console(void) printk(KERN_INFO "early console immortal !\n"); udbg_console.flags &= ~CON_BOOT; } - early_console_initialized = 1; + early_console = &udbg_console; register_console(&udbg_console); } diff --git a/arch/sh/kernel/sh_bios.c b/arch/sh/kernel/sh_bios.c index 47475cca068a..fe584e516964 100644 --- a/arch/sh/kernel/sh_bios.c +++ b/arch/sh/kernel/sh_bios.c @@ -104,6 +104,7 @@ void sh_bios_vbr_reload(void) ); } +#ifdef CONFIG_EARLY_PRINTK /* * Print a string through the BIOS */ @@ -144,8 +145,6 @@ static struct console bios_console = { .index = -1, }; -static struct console *early_console; - static int __init setup_early_printk(char *buf) { int keep_early = 0; @@ -170,3 +169,4 @@ static int __init setup_early_printk(char *buf) return 0; } early_param("earlyprintk", setup_early_printk); +#endif diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c index afb9c9a0d887..34d72a151bf3 100644 --- a/arch/tile/kernel/early_printk.c +++ b/arch/tile/kernel/early_printk.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/string.h> #include <linux/irqflags.h> +#include <linux/printk.h> #include <asm/setup.h> #include <hv/hypervisor.h> @@ -33,25 +34,8 @@ static struct console early_hv_console = { }; /* Direct interface for emergencies */ -static struct console *early_console = &early_hv_console; -static int early_console_initialized; static int early_console_complete; -static void early_vprintk(const char *fmt, va_list ap) -{ - char buf[512]; - int n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_console->write(early_console, buf, n); -} - -void early_printk(const char *fmt, ...) -{ - va_list ap; - va_start(ap, fmt); - early_vprintk(fmt, ap); - va_end(ap); -} - void early_panic(const char *fmt, ...) { va_list ap; @@ -69,14 +53,13 @@ static int __initdata keep_early; static int __init setup_early_printk(char *str) { - if (early_console_initialized) + if (early_console) return 1; if (str != NULL && strncmp(str, "keep", 4) == 0) keep_early = 1; early_console = &early_hv_console; - early_console_initialized = 1; register_console(early_console); return 0; @@ -85,12 +68,12 @@ static int __init setup_early_printk(char *str) void __init disable_early_printk(void) { early_console_complete = 1; - if (!early_console_initialized || !early_console) + if (!early_console) return; if (!keep_early) { early_printk("disabling early console\n"); unregister_console(early_console); - early_console_initialized = 0; + early_console = NULL; } else { early_printk("keeping early console\n"); } @@ -98,7 +81,7 @@ void __init disable_early_printk(void) void warn_early_printk(void) { - if (early_console_complete || early_console_initialized) + if (early_console_complete || early_console) return; early_printk("\ Machine shutting down before console output is fully initialized.\n\ diff --git a/arch/um/kernel/early_printk.c b/arch/um/kernel/early_printk.c index 49480f092456..4a0800bc37b2 100644 --- a/arch/um/kernel/early_printk.c +++ b/arch/um/kernel/early_printk.c @@ -16,7 +16,7 @@ static void early_console_write(struct console *con, const char *s, unsigned int um_early_printk(s, n); } -static struct console early_console = { +static struct console early_console_dev = { .name = "earlycon", .write = early_console_write, .flags = CON_BOOT, @@ -25,8 +25,10 @@ static struct console early_console = { static int __init setup_early_printk(char *buf) { - register_console(&early_console); - + if (!early_console) { + early_console = &early_console_dev; + register_console(&early_console_dev); + } return 0; } diff --git a/arch/unicore32/kernel/early_printk.c b/arch/unicore32/kernel/early_printk.c index 3922255f1fa8..9be0d5d02a9a 100644 --- a/arch/unicore32/kernel/early_printk.c +++ b/arch/unicore32/kernel/early_printk.c @@ -33,21 +33,17 @@ static struct console early_ocd_console = { .index = -1, }; -/* Direct interface for emergencies */ -static struct console *early_console = &early_ocd_console; - -static int __initdata keep_early; - static int __init setup_early_printk(char *buf) { - if (!buf) + int keep_early; + + if (!buf || early_console) return 0; if (strstr(buf, "keep")) keep_early = 1; - if (!strncmp(buf, "ocd", 3)) - early_console = &early_ocd_console; + early_console = &early_ocd_console; if (keep_early) early_console->flags &= ~CON_BOOT; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 9b9f18b49918..d15f575a861b 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -169,25 +169,9 @@ static struct console early_serial_console = { .index = -1, }; -/* Direct interface for emergencies */ -static struct console *early_console = &early_vga_console; -static int __initdata early_console_initialized; - -asmlinkage void early_printk(const char *fmt, ...) -{ - char buf[512]; - int n; - va_list ap; - - va_start(ap, fmt); - n = vscnprintf(buf, sizeof(buf), fmt, ap); - early_console->write(early_console, buf, n); - va_end(ap); -} - static inline void early_console_register(struct console *con, int keep_early) { - if (early_console->index != -1) { + if (con->index != -1) { printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", con->name); return; @@ -207,9 +191,8 @@ static int __init setup_early_printk(char *buf) if (!buf) return 0; - if (early_console_initialized) + if (early_console) return 0; - early_console_initialized = 1; keep = (strstr(buf, "keep") != NULL); diff --git a/include/linux/console.h b/include/linux/console.h index 29680a8cda99..73bab0f58af5 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -141,6 +141,7 @@ struct console { for (con = console_drivers; con != NULL; con = con->next) extern int console_set_on_cmdline; +extern struct console *early_console; extern int add_preferred_console(char *name, int idx, char *options); extern int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options); diff --git a/include/linux/printk.h b/include/linux/printk.h index 822171fcb1c8..1c35c23bccd7 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -95,8 +95,14 @@ int no_printk(const char *fmt, ...) return 0; } +#ifdef CONFIG_EARLY_PRINTK extern asmlinkage __printf(1, 2) void early_printk(const char *fmt, ...); +void early_vprintk(const char *fmt, va_list ap); +#else +static inline __printf(1, 2) __cold +void early_printk(const char *s, ...) { } +#endif #ifdef CONFIG_PRINTK asmlinkage __printf(5, 0) diff --git a/kernel/printk.c b/kernel/printk.c index 2de593df036b..1c8ca176458f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -49,13 +49,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/printk.h> -/* - * Architectures can override it: - */ -void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) -{ -} - /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -1723,6 +1716,29 @@ static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ +#ifdef CONFIG_EARLY_PRINTK +struct console *early_console; + +void early_vprintk(const char *fmt, va_list ap) +{ + if (early_console) { + char buf[512]; + int n = vscnprintf(buf, sizeof(buf), fmt, ap); + + early_console->write(early_console, buf, n); + } +} + +asmlinkage void early_printk(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + early_vprintk(fmt, ap); + va_end(ap); +} +#endif + static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) { -- cgit v1.2.3 From 0a285317daf785424c884d1a7548705c60bc42cc Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser <nikai@nikai.net> Date: Mon, 29 Apr 2013 16:17:20 -0700 Subject: printk: fix failure to return error in devkmsg_poll() Error value got overwritten instantly. Signed-off-by: Nicolas Kaiser <nikai@nikai.net> Cc: Kay Sievers <kay.sievers@vrfy.org> Cc: Greg KH <greg@kroah.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/printk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 1c8ca176458f..376914e2869d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -601,7 +601,8 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; - ret = POLLIN|POLLRDNORM; + else + ret = POLLIN|POLLRDNORM; } raw_spin_unlock_irq(&logbuf_lock); -- cgit v1.2.3 From 6d65df3325c380f3c897330c48f0e53d73b8f362 Mon Sep 17 00:00:00 2001 From: Akinobu Mita <akinobu.mita@gmail.com> Date: Mon, 29 Apr 2013 16:21:30 -0700 Subject: kernel/: rename random32() to prandom_u32() Use preferable function name which implies using a pseudo-random number generator. Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/rcutree.c | 2 +- kernel/test_kprobes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 5b8ad827fd86..2f8530bc4b17 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1319,7 +1319,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rnp->grphi, rnp->qsmask); raw_spin_unlock_irq(&rnp->lock); #ifdef CONFIG_PROVE_RCU_DELAY - if ((random32() % (rcu_num_nodes * 8)) == 0) + if ((prandom_u32() % (rcu_num_nodes * 8)) == 0) schedule_timeout_uninterruptible(2); #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ cond_resched(); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index f8b11a283171..12d6ebbfdd83 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -365,7 +365,7 @@ int init_test_probes(void) target2 = kprobe_target2; do { - rand1 = random32(); + rand1 = prandom_u32(); } while (rand1 <= div_factor); printk(KERN_INFO "Kprobe smoke test started\n"); -- cgit v1.2.3 From 6c24499f40d96bf07a85b709fb1bee5cea611a1d Mon Sep 17 00:00:00 2001 From: Steven Rostedt <rostedt@goodmis.org> Date: Mon, 29 Apr 2013 20:08:14 -0400 Subject: tracing: Fix small merge bug During the 3.10 merge, a conflict happened and the resolution was almost, but not quite, correct. An if statement was reversed. Signed-off-by: Steven Rostedt <rostedt@goodmis.org> [ Duh. That was just silly of me - Linus ] Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 581630a6387d..ae6fa2d1cdf7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -904,7 +904,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) return; WARN_ON_ONCE(!irqs_disabled()); - if (tr->allocated_snapshot) { + if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ WARN_ON_ONCE(tr->current_trace != &nop_trace); return; -- cgit v1.2.3