From 2a01bb3885c9145dbb7583d5aa5f5d5504f6f46f Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Wed, 11 Apr 2012 08:15:29 -0400 Subject: panic: Make panic_on_oops configurable Several distros set this by default by patching panic_on_oops. It seems to fit with the BOOTPARAM_{HARD,SOFT}_PANIC options though, so let's add a Kconfig entry and reduce some more upstream delta. Signed-off-by: Kyle McMartin Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120411121529.GH26688@redacted.bos.redhat.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 8ed89a175d79..b6215b7ce99d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,7 +27,7 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 -int panic_on_oops; +int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; -- cgit v1.2.3 From 62be73eafaa045d3233337303fb140f7f8a61135 Mon Sep 17 00:00:00 2001 From: Seiji Aguchi Date: Tue, 15 May 2012 17:35:09 -0400 Subject: kdump: Execute kmsg_dump(KMSG_DUMP_PANIC) after smp_send_stop() This patch moves kmsg_dump(KMSG_DUMP_PANIC) below smp_send_stop(), to serialize the crash-logging process via smp_send_stop() and to thus retrieve a more stable crash image of all CPUs stopped. Signed-off-by: Seiji Aguchi Acked-by: Don Zickus Cc: dle-develop@lists.sourceforge.net Cc: Satoru Moriya Cc: Tony Luck Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/5C4C569E8A4B9B42A84A977CF070A35B2E4D7A5CE2@USINDEVS01.corp.hds.com Signed-off-by: Ingo Molnar --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index b6215b7ce99d..d2a5f4ecc6dd 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -108,8 +108,6 @@ void panic(const char *fmt, ...) */ crash_kexec(NULL); - kmsg_dump(KMSG_DUMP_PANIC); - /* * Note smp_send_stop is the usual smp shutdown function, which * unfortunately means it may not be hardened to work in a panic @@ -117,6 +115,8 @@ void panic(const char *fmt, ...) */ smp_send_stop(); + kmsg_dump(KMSG_DUMP_PANIC); + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); bust_spinlocks(0); -- cgit v1.2.3 From 967db0ea65b0bf8507a7643ac8f296c4f2c0a834 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Wed, 6 Jun 2012 18:51:35 -0700 Subject: cgroup: make sure that decisions in __css_put are atomic __css_put is using atomic_dec on the ref count, and then looking at the ref count to make decisions. This is prone to races, as someone else may decrement ref count between our decrement and our decision. Instead, we should base our decisions on the value that we decremented the ref count to. (This results in an actual race on Google's kernel which I haven't been able to reproduce on the upstream kernel. Having said that, it's still incorrect by inspection). 
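For illustration, a minimal userspace C11 analogue of the decrement-then-read race described above (a sketch only, not part of the patch; put_racy() and put_fixed() are made-up names):

#include <stdatomic.h>

/* Racy: another thread may decrement between the two steps, so the
 * value we branch on need not reflect our own decrement. This mirrors
 * atomic_dec() followed by a separate css_refcnt() read. */
static int put_racy(atomic_int *refcnt)
{
        atomic_fetch_sub(refcnt, 1);
        return atomic_load(refcnt);
}

/* Race-free: branch on the value produced by the decrement itself,
 * which is what switching to atomic_dec_return() achieves. */
static int put_fixed(atomic_int *refcnt)
{
        return atomic_fetch_sub(refcnt, 1) - 1;
}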
Signed-off-by: Salman Qazi Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 72fcd3069a90..ceeafe874b3f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4984,8 +4984,7 @@ void __css_put(struct cgroup_subsys_state *css) struct cgroup *cgrp = css->cgroup; rcu_read_lock(); - atomic_dec(&css->refcnt); - switch (css_refcnt(css)) { + switch (atomic_dec_return(&css->refcnt)) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.2.3 From f2bf1f6f5f89d031245067512449fc889b2f4bb2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 Jun 2012 19:50:40 -0400 Subject: tracing: Have tracing_off() actually turn tracing off A recent update to have tracing_on/off() only affect the ftrace ring buffers instead of all ring buffers had a cut and paste error. The tracing_off() did the exact same thing as tracing_on() and would not actually turn off tracing. Unfortunately, tracing_off() is more important to be working than tracing_on() as this is a key development tool, as it lets the developer turn off tracing as soon as a problem is discovered. It is also used by panic and oops code. This bug also breaks the 'echo func:traceoff > set_ftrace_filter' Cc: # 3.4 Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..49249c28690d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -371,7 +371,7 @@ EXPORT_SYMBOL_GPL(tracing_on); void tracing_off(void) { if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + ring_buffer_record_off(global_trace.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race -- cgit v1.2.3 From 8f5af6f1f2d09fe5eac86a5dc1731a5917c1503a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 4 May 2012 08:31:53 -0700 Subject: rcu: RCU_FAST_NO_HZ detection of callback adoption In the present implementations of CPU hotplug, the outgoing CPU is guaranteed to run its stop-machine process on the way out, which will guarantee that RCU_FAST_NO_HZ forces the CPU out of dyntick-idle mode. However, new versions of CPU hotplug might not work this way. This commit therefore removes this design constraint by explicitly notifying CPUs when they adopt non-lazy RCU callbacks. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0da7b88d92d0..3b0f1337f75b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1397,6 +1397,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rdp->qlen_lazy += rsp->qlen_lazy; rdp->qlen += rsp->qlen; rdp->n_cbs_adopted += rsp->qlen; + if (rsp->qlen_lazy != rsp->qlen) + rcu_idle_count_callbacks_posted(); rsp->qlen_lazy = 0; rsp->qlen = 0; -- cgit v1.2.3 From fd4b352687fd8604d49c190c4c9ea9e369fd42d5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sat, 5 May 2012 19:10:35 -0700 Subject: rcu: Update RCU_FAST_NO_HZ tracing for lazy callbacks In the current code, a short dyntick-idle interval (where there is at least one non-lazy callback on the CPU) and a long dyntick-idle interval (where there are only lazy callbacks on the CPU) are traced identically, which can be less than helpful. This commit therefore emits different event traces in these two cases. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/trace/events/rcu.h | 1 + kernel/rcutree_plugin.h | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 1480900c511c..d274734b2aa4 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -289,6 +289,7 @@ TRACE_EVENT(rcu_dyntick, * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. + * "Dyntick with lazy callbacks": Entering dyntick-idle w/lazy callbacks. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! * "Timer": Timer fired to cause CPU to continue processing callbacks. diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d9869..5449f02c4820 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2165,15 +2165,17 @@ static void rcu_prepare_for_idle(int cpu) !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_GP_DELAY; - else + } else { per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_LAZY_GP_DELAY; + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } tp = &per_cpu(rcu_idle_gp_timer, cpu); mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); per_cpu(rcu_nonlazy_posted_snap, cpu) = -- cgit v1.2.3 From 5955f7eecd77d6b440db278b266cfecdb72ecd00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 May 2012 12:07:05 -0700 Subject: rcu: Move RCU_FAST_NO_HZ per-CPU variables to rcu_dynticks structure The RCU_FAST_NO_HZ code relies on a number of per-CPU variables. This works, but is hidden from someone scanning the data structures in rcutree.h. This commit therefore converts these per-CPU variables to fields in the per-CPU rcu_dynticks structures. Suggested-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- kernel/rcutree.h | 14 +++++++ kernel/rcutree_plugin.h | 99 ++++++++++++++++++++++--------------------------- 2 files changed, 58 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 7f5d138dedf5..ea056495783e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -84,6 +84,20 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. 
*/ +#ifdef CONFIG_RCU_FAST_NO_HZ + int dyntick_drain; /* Prepare-for-idle state variable. */ + unsigned long dyntick_holdoff; + /* No retries for the jiffy of failure. */ + struct timer_list idle_gp_timer; + /* Wake up CPU sleeping with callbacks. */ + unsigned long idle_gp_timer_expires; + /* When to wake up CPU (for repost). */ + bool idle_first_pass; /* First pass of attempt to go idle? */ + unsigned long nonlazy_posted; + /* # times non-lazy CBs posted to CPU. */ + unsigned long nonlazy_posted_snap; + /* idle-period nonlazy_posted snapshot. */ +#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; /* RCU's kthread states for tracing. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5449f02c4820..6bd9637d5d83 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1962,21 +1962,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* Loop counter for rcu_prepare_for_idle(). */ -static DEFINE_PER_CPU(int, rcu_dyntick_drain); -/* If rcu_dyntick_holdoff==jiffies, don't try to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); -/* Timer to awaken the CPU if it enters dyntick-idle mode with callbacks. */ -static DEFINE_PER_CPU(struct timer_list, rcu_idle_gp_timer); -/* Scheduled expiry time for rcu_idle_gp_timer to allow reposting. */ -static DEFINE_PER_CPU(unsigned long, rcu_idle_gp_timer_expires); -/* Enable special processing on first attempt to enter dyntick-idle mode. */ -static DEFINE_PER_CPU(bool, rcu_idle_first_pass); -/* Running count of non-lazy callbacks posted, never decremented. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted); -/* Snapshot of rcu_nonlazy_posted to detect meaningful exits from idle. */ -static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); - /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no * callbacks on this CPU, (2) this CPU has not yet attempted to enter @@ -1988,13 +1973,15 @@ static DEFINE_PER_CPU(unsigned long, rcu_nonlazy_posted_snap); */ int rcu_needs_cpu(int cpu) { + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + /* Flag a new idle sojourn to the idle-entry state machine. */ - per_cpu(rcu_idle_first_pass, cpu) = 1; + rdtp->idle_first_pass = 1; /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(cpu)) return 0; /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; + return rdtp->dyntick_holdoff == jiffies; } /* @@ -2075,21 +2062,24 @@ static void rcu_idle_gp_timer_func(unsigned long cpu_in) */ static void rcu_prepare_for_idle_init(int cpu) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - setup_timer(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_timer_func, cpu); - per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies - 1; - per_cpu(rcu_idle_first_pass, cpu) = 1; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + rdtp->dyntick_holdoff = jiffies - 1; + setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); + rdtp->idle_gp_timer_expires = jiffies - 1; + rdtp->idle_first_pass = 1; } /* * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to rcu_idle_gp_timer, so cancel it. This will + * is no longer any point to ->idle_gp_timer, so cancel it. This will * do nothing if this timer is not active, so just cancel it unconditionally. 
*/ static void rcu_cleanup_after_idle(int cpu) { - del_timer(&per_cpu(rcu_idle_gp_timer, cpu)); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + del_timer(&rdtp->idle_gp_timer); trace_rcu_prep_idle("Cleanup after idle"); } @@ -2108,42 +2098,41 @@ static void rcu_cleanup_after_idle(int cpu) * Because it is not legal to invoke rcu_process_callbacks() with irqs * disabled, we do one pass of force_quiescent_state(), then do a * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + * later. The ->dyntick_drain field controls the sequencing. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { struct timer_list *tp; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); /* * If this is an idle re-entry, for example, due to use of * RCU_NONIDLE() or the new idle-loop tracing API within the idle * loop, then don't take any state-machine actions, unless the * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the rcu_idle_gp_timer if this CPU has callbacks + * Instead, repost the ->idle_gp_timer if this CPU has callbacks * pending. */ - if (!per_cpu(rcu_idle_first_pass, cpu) && - (per_cpu(rcu_nonlazy_posted, cpu) == - per_cpu(rcu_nonlazy_posted_snap, cpu))) { + if (!rdtp->idle_first_pass && + (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { if (rcu_cpu_has_callbacks(cpu)) { - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); } return; } - per_cpu(rcu_idle_first_pass, cpu) = 0; - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu) - 1; + rdtp->idle_first_pass = 0; + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. */ if (!rcu_cpu_has_callbacks(cpu)) { - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - per_cpu(rcu_dyntick_drain, cpu) = 0; + rdtp->dyntick_holdoff = jiffies - 1; + rdtp->dyntick_drain = 0; trace_rcu_prep_idle("No callbacks"); return; } @@ -2152,38 +2141,37 @@ static void rcu_prepare_for_idle(int cpu) * If in holdoff mode, just return. We will presumably have * refrained from disabling the scheduling-clock tick. */ - if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { + if (rdtp->dyntick_holdoff == jiffies) { trace_rcu_prep_idle("In holdoff"); return; } - /* Check and update the rcu_dyntick_drain sequencing. */ - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* Check and update the ->dyntick_drain sequencing. */ + if (rdtp->dyntick_drain <= 0) { /* First time through, initialize the counter. */ - per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; - } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && + rdtp->dyntick_drain = RCU_IDLE_FLUSHES; + } else if (rdtp->dyntick_drain <= RCU_IDLE_OPT_FLUSHES && !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? 
*/ - per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_drain = 0; + rdtp->dyntick_holdoff = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) { trace_rcu_prep_idle("Dyntick with callbacks"); - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_GP_DELAY; } else { - per_cpu(rcu_idle_gp_timer_expires, cpu) = + rdtp->idle_gp_timer_expires = jiffies + RCU_IDLE_LAZY_GP_DELAY; trace_rcu_prep_idle("Dyntick with lazy callbacks"); } - tp = &per_cpu(rcu_idle_gp_timer, cpu); - mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); - per_cpu(rcu_nonlazy_posted_snap, cpu) = - per_cpu(rcu_nonlazy_posted, cpu); + tp = &rdtp->idle_gp_timer; + mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; return; /* Nothing more to do immediately. */ - } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + } else if (--(rdtp->dyntick_drain) <= 0) { /* We have hit the limit, so time to give up. */ - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + rdtp->dyntick_holdoff = jiffies; trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2229,7 +2217,7 @@ static void rcu_prepare_for_idle(int cpu) */ static void rcu_idle_count_callbacks_posted(void) { - __this_cpu_add(rcu_nonlazy_posted, 1); + __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ @@ -2240,11 +2228,12 @@ static void rcu_idle_count_callbacks_posted(void) static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { - struct timer_list *tltp = &per_cpu(rcu_idle_gp_timer, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct timer_list *tltp = &rdtp->idle_gp_timer; sprintf(cp, "drain=%d %c timer=%lu", - per_cpu(rcu_dyntick_drain, cpu), - per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + rdtp->dyntick_drain, + rdtp->dyntick_holdoff == jiffies ? 'H' : '.', timer_pending(tltp) ? tltp->expires - jiffies : -1); } -- cgit v1.2.3 From aa9b16306e3243229580ff889cc59fd66bf77973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 16:41:44 -0700 Subject: rcu: Precompute RCU_FAST_NO_HZ timer offsets When a CPU is entering dyntick-idle mode, tick_nohz_stop_sched_tick() calls rcu_needs_cpu() to see if RCU needs that CPU, and, if not, computes the next wakeup time based on the timer wheels. Only later, when actually entering the idle loop, rcu_prepare_for_idle() will be invoked. In some cases, rcu_prepare_for_idle() will post timers to wake the CPU back up. But all for naught: The next wakeup time for the CPU has already been computed, and posting a timer afterwards does not force that wakeup time to be recomputed. This means that rcu_prepare_for_idle()'s timer postings have no effect. This is not a problem on a busy system because something else will wake up the CPU soon enough. However, on lightly loaded systems, the CPU might stay asleep for a considerable length of time. If that CPU has a callback that the rest of the system is waiting on, the system might run very slowly or (in theory) even hang. This commit avoids this problem by having rcu_needs_cpu() give tick_nohz_stop_sched_tick() an estimate of when RCU will need the CPU to wake back up, which tick_nohz_stop_sched_tick() takes into account when programming the CPU's wakeup time.
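As a rough sketch of the caller-side idea (paraphrasing the tick-sched.c hunk later in this patch; variable names simplified), the tick-stop path takes the earlier of the timer-wheel wakeup and the wakeup RCU reports, so a timer that rcu_prepare_for_idle() posts afterwards can no longer be missed:

        if (rcu_needs_cpu(cpu, &rcu_delta_jiffies)) {
                delta_jiffies = 1;              /* keep the tick running */
        } else {
                delta_jiffies = get_next_timer_interrupt(last_jiffies) - last_jiffies;
                if (rcu_delta_jiffies < delta_jiffies)
                        delta_jiffies = rcu_delta_jiffies;      /* honor RCU's estimate */
        }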
An alternative approach is for rcu_prepare_for_idle() to use hrtimers instead of normal timers, but timers are much more efficient than are hrtimers for frequently and repeatedly posting and cancelling a given timer, which is exactly what RCU_FAST_NO_HZ does. Reported-by: Pascal Chapperon Reported-by: Heiko Carstens Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/linux/rcutiny.h | 6 +++-- include/linux/rcutree.h | 2 +- kernel/rcutree_plugin.h | 66 +++++++++++++++++++++++++++++++----------------- kernel/time/tick-sched.c | 7 ++++- 4 files changed, 54 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index adb5e5a38cae..854dc4c5c271 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,8 +87,9 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return 0; } @@ -96,8 +97,9 @@ static inline int rcu_needs_cpu(int cpu) int rcu_preempt_needs_cpu(void); -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_preempt_needs_cpu(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3c6083cde4fc..952b79339304 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -32,7 +32,7 @@ extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); -extern int rcu_needs_cpu(int cpu); +extern int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies); extern void rcu_cpu_stall_reset(void); /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6bd9637d5d83..5271a020887e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1962,28 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return rdtp->dyntick_holdoff == jiffies; -} - /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? 
Both RCU flavor and CPU are specified by the @@ -2026,6 +2005,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +/* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + *delta_jiffies = RCU_IDLE_GP_DELAY; + else + *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + return 0; +} + /* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. 
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff561..52f5ebbd443b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off -- cgit v1.2.3 From 6ebb017de9d59a18c3ff9648270e8f6abaa93438 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Tue, 5 Jun 2012 08:52:34 +0200 Subject: printk: Fix alignment of buf causing crash on ARM EABI Commit 7ff9554bb578ba02166071d2d487b7fc7d860d62, printk: convert byte-buffer to variable-length record buffer, causes systems using EABI to crash very early in the boot cycle. The first entry in struct log is a u64, which for EABI must be 8 byte aligned. Make use of __alignof__() so the compiler to decide the alignment, but allow it to be overridden using CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, for systems which can perform unaligned access and want to save a few bytes of space. Tested on Orion5x and Kirkwood. Signed-off-by: Andrew Lunn Tested-by: Stephen Warren Acked-by: Stephen Warren Acked-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 32462d2b364a..f205c25c37e2 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -227,10 +227,10 @@ static u32 clear_idx; #define LOG_LINE_MAX 1024 /* record buffer */ -#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) #define LOG_ALIGN 4 #else -#define LOG_ALIGN 8 +#define LOG_ALIGN __alignof__(struct log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); -- cgit v1.2.3 From 047fe3605235888f3ebcda0c728cb31937eadfe6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 15:24:40 +0200 Subject: splice: fix racy pipe->buffers uses Dave Jones reported a kernel BUG at mm/slub.c:3474! triggered by splice_shrink_spd() called from vmsplice_to_pipe() commit 35f3d14dbbc5 (pipe: add support for shrinking and growing pipes) added capability to adjust pipe->buffers. Problem is some paths don't hold pipe mutex and assume pipe->buffers doesn't change for their duration. Fix this by adding nr_pages_max field in struct splice_pipe_desc, and use it in place of pipe->buffers where appropriate. splice_shrink_spd() loses its struct pipe_inode_info argument. 
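The underlying pattern, shown as a small self-contained userspace C sketch (not part of the patch; desc_grow() and desc_fill() are made-up names): read the concurrently-resizable limit once and let every later user, allocation size and loop bound alike, work from that snapshot, which is the role spd->nr_pages_max plays here.

#include <stdatomic.h>
#include <stdlib.h>

struct desc {
        unsigned int nr_max;    /* snapshot, like spd->nr_pages_max */
        int *slots;
};

static int desc_grow(struct desc *d, atomic_uint *limit)
{
        d->nr_max = atomic_load(limit); /* read the shared value exactly once */
        d->slots = calloc(d->nr_max, sizeof(*d->slots));
        return d->slots ? 0 : -1;
}

static void desc_fill(struct desc *d)
{
        unsigned int i;

        for (i = 0; i < d->nr_max; i++) /* bounded by the snapshot, not *limit */
                d->slots[i] = (int)i;
}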
Reported-by: Dave Jones Signed-off-by: Eric Dumazet Cc: Jens Axboe Cc: Alexander Viro Cc: Tom Herbert Cc: stable # 2.6.35 Tested-by: Dave Jones Signed-off-by: Jens Axboe --- fs/splice.c | 35 ++++++++++++++++++++--------------- include/linux/splice.h | 8 ++++---- kernel/relay.c | 5 +++-- kernel/trace/trace.c | 6 ++++-- mm/shmem.c | 3 ++- net/core/skbuff.c | 1 + 6 files changed, 34 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/fs/splice.c b/fs/splice.c index c9f1318a3b82..7bf08fa22ec9 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -273,13 +273,16 @@ void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) * Check if we need to grow the arrays holding pages and partial page * descriptions. */ -int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) { - if (pipe->buffers <= PIPE_DEF_BUFFERS) + unsigned int buffers = ACCESS_ONCE(pipe->buffers); + + spd->nr_pages_max = buffers; + if (buffers <= PIPE_DEF_BUFFERS) return 0; - spd->pages = kmalloc(pipe->buffers * sizeof(struct page *), GFP_KERNEL); - spd->partial = kmalloc(pipe->buffers * sizeof(struct partial_page), GFP_KERNEL); + spd->pages = kmalloc(buffers * sizeof(struct page *), GFP_KERNEL); + spd->partial = kmalloc(buffers * sizeof(struct partial_page), GFP_KERNEL); if (spd->pages && spd->partial) return 0; @@ -289,10 +292,9 @@ int splice_grow_spd(struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) return -ENOMEM; } -void splice_shrink_spd(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +void splice_shrink_spd(struct splice_pipe_desc *spd) { - if (pipe->buffers <= PIPE_DEF_BUFFERS) + if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) return; kfree(spd->pages); @@ -315,6 +317,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -326,7 +329,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, index = *ppos >> PAGE_CACHE_SHIFT; loff = *ppos & ~PAGE_CACHE_MASK; req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - nr_pages = min(req_pages, pipe->buffers); + nr_pages = min(req_pages, spd.nr_pages_max); /* * Lookup the (hopefully) full range of pages we need. 
@@ -497,7 +500,7 @@ fill_it: if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return error; } @@ -598,6 +601,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &default_pipe_buf_ops, .spd_release = spd_release_page, @@ -608,8 +612,8 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, res = -ENOMEM; vec = __vec; - if (pipe->buffers > PIPE_DEF_BUFFERS) { - vec = kmalloc(pipe->buffers * sizeof(struct iovec), GFP_KERNEL); + if (spd.nr_pages_max > PIPE_DEF_BUFFERS) { + vec = kmalloc(spd.nr_pages_max * sizeof(struct iovec), GFP_KERNEL); if (!vec) goto shrink_ret; } @@ -617,7 +621,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - for (i = 0; i < nr_pages && i < pipe->buffers && len; i++) { + for (i = 0; i < nr_pages && i < spd.nr_pages_max && len; i++) { struct page *page; page = alloc_page(GFP_USER); @@ -665,7 +669,7 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, shrink_ret: if (vec != __vec) kfree(vec); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return res; err: @@ -1614,6 +1618,7 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &user_page_pipe_buf_ops, .spd_release = spd_release_page, @@ -1629,13 +1634,13 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, spd.nr_pages = get_iovec_page_array(iov, nr_segs, spd.pages, spd.partial, false, - pipe->buffers); + spd.nr_pages_max); if (spd.nr_pages <= 0) ret = spd.nr_pages; else ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return ret; } diff --git a/include/linux/splice.h b/include/linux/splice.h index 26e5b613deda..09a545a7dfa3 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -51,7 +51,8 @@ struct partial_page { struct splice_pipe_desc { struct page **pages; /* page map */ struct partial_page *partial; /* pages[] may not be contig */ - int nr_pages; /* number of pages in map */ + int nr_pages; /* number of populated pages in map */ + unsigned int nr_pages_max; /* pages[] & partial[] arrays size */ unsigned int flags; /* splice flags */ const struct pipe_buf_operations *ops;/* ops associated with output pipe */ void (*spd_release)(struct splice_pipe_desc *, unsigned int); @@ -85,9 +86,8 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, /* * for dynamic pipe sizing */ -extern int splice_grow_spd(struct pipe_inode_info *, struct splice_pipe_desc *); -extern void splice_shrink_spd(struct pipe_inode_info *, - struct splice_pipe_desc *); +extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *); +extern void splice_shrink_spd(struct splice_pipe_desc *); extern void spd_release_page(struct splice_pipe_desc *, unsigned int); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; diff --git a/kernel/relay.c b/kernel/relay.c index ab56a1764d4d..e8cd2027abbd 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1235,6 +1235,7 @@ static ssize_t subbuf_splice_actor(struct file *in, struct splice_pipe_desc spd = { .pages = pages, .nr_pages = 0, + .nr_pages_max = 
PIPE_DEF_BUFFERS, .partial = partial, .flags = flags, .ops = &relay_pipe_buf_ops, @@ -1302,8 +1303,8 @@ static ssize_t subbuf_splice_actor(struct file *in, ret += padding; out: - splice_shrink_spd(pipe, &spd); - return ret; + splice_shrink_spd(&spd); + return ret; } static ssize_t relay_file_splice_read(struct file *in, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 68032c6177db..288488082224 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3609,6 +3609,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .pages = pages_def, .partial = partial_def, .nr_pages = 0, /* This gets updated below. */ + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, @@ -3680,7 +3681,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, ret = splice_to_pipe(pipe, &spd); out: - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); return ret; out_err: @@ -4231,6 +4232,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages_def, .partial = partial_def, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &buffer_pipe_buf_ops, .spd_release = buffer_spd_release, @@ -4318,7 +4320,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } ret = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); out: return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 585bd220a21e..c244e93a70fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1577,6 +1577,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = PIPE_DEF_BUFFERS, .flags = flags, .ops = &page_cache_pipe_buf_ops, .spd_release = spd_release_page, @@ -1665,7 +1666,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (spd.nr_pages) error = splice_to_pipe(pipe, &spd); - splice_shrink_spd(pipe, &spd); + splice_shrink_spd(&spd); if (error > 0) { *ppos += error; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 016694d62484..bac3c5756d63 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1755,6 +1755,7 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset, struct splice_pipe_desc spd = { .pages = pages, .partial = partial, + .nr_pages_max = MAX_SKB_FRAGS, .flags = flags, .ops = &sock_pipe_buf_ops, .spd_release = sock_spd_release, -- cgit v1.2.3 From a70270468234749741c5893ae78e5bb524771402 Mon Sep 17 00:00:00 2001 From: Don Zickus Date: Wed, 13 Jun 2012 09:35:48 -0400 Subject: watchdog: Quiet down the boot messages A bunch of bugzillas have complained how noisy the nmi_watchdog is during boot-up especially with its expected failure cases (like virt and bios resource contention). This is my attempt to quiet them down and keep it less confusing for the end user. What I did is print the message for cpu0 and save it for future comparisons. If future cpus have an identical message as cpu0, then don't print the redundant info. However, if a future cpu has a different message, happily print that loudly. Before the change, you would see something like: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... 
event mask: 0000000700000003 NMI watchdog enabled, takes one hw-pmu counter. Booting Node 0, Processors #1 NMI watchdog enabled, takes one hw-pmu counter. #2 NMI watchdog enabled, takes one hw-pmu counter. #3 Ok. NMI watchdog enabled, takes one hw-pmu counter. Brought up 4 CPUs Total of 4 processors activated (22607.24 BogoMIPS). After the change, it is simplified to: ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1 CPU0: Intel(R) Core(TM)2 Quad CPU Q9550 @ 2.83GHz stepping 0a Performance Events: PEBS fmt0+, Core2 events, Intel PMU driver. ... version: 2 ... bit width: 40 ... generic registers: 2 ... value mask: 000000ffffffffff ... max period: 000000007fffffff ... fixed-purpose events: 3 ... event mask: 0000000700000003 NMI watchdog: enabled on all CPUs, permanently consumes one hw-PMU counter. Booting Node 0, Processors #1 #2 #3 Ok. Brought up 4 CPUs V2: little changes based on Joe Perches' feedback V3: printk cleanup based on Ingo's feedback; checkpatch fix V4: keep printk as one long line V5: Ingo fix ups Reported-and-tested-by: Nathan Zimmer Signed-off-by: Don Zickus Cc: nzimmer@sgi.com Cc: joe@perches.com Link: http://lkml.kernel.org/r/1339594548-17227-1-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index e5e1d85b8c7c..4b1dfba70f7c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -372,6 +372,13 @@ static int watchdog(void *unused) #ifdef CONFIG_HARDLOCKUP_DETECTOR +/* + * People like the simple clean cpu node info on boot. + * Reduce the watchdog noise by only printing messages + * that are different from what cpu0 displayed. + */ +static unsigned long cpu0_err; + static int watchdog_nmi_enable(int cpu) { struct perf_event_attr *wd_attr; @@ -390,11 +397,21 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); + + /* save cpu0 error for future comparision */ + if (cpu == 0 && IS_ERR(event)) + cpu0_err = PTR_ERR(event); + if (!IS_ERR(event)) { - pr_info("enabled, takes one hw-pmu counter.\n"); + /* only print for cpu0 or different than cpu0 */ + if (cpu == 0 || cpu0_err) + pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n"); goto out_save; } + /* skip displaying the same error again */ + if (cpu > 0 && (PTR_ERR(event) == cpu0_err)) + return PTR_ERR(event); /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) -- cgit v1.2.3 From e2ae715d66bf4becfb85eb84b7150e23cf27df30 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 15 Jun 2012 14:07:51 +0200 Subject: kmsg - kmsg_dump() use iterator to receive log buffer content Provide an iterator to receive the log buffer content, and convert all kmsg_dump() users to it. The structured data in the kmsg buffer now contains binary data, which should no longer be copied verbatim to the kmsg_dump() users. The iterator should provide reliable access to the buffer data, and also supports proper log line-aware chunking of data while iterating. 
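As a usage sketch of the new iterator interface (illustrative only, not taken from the patch; example_console_write() is a hypothetical sink), a dumper now walks the records itself instead of receiving two raw buffer sections:

#include <linux/init.h>
#include <linux/kmsg_dump.h>

static void example_dump(struct kmsg_dumper *dumper,
                         enum kmsg_dump_reason reason)
{
        static char line[1024];
        size_t len;

        /* Walk the records oldest to newest, one line at a time. */
        while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len))
                example_console_write(line, len);
}

static struct kmsg_dumper example_dumper = {
        .dump       = example_dump,
        .max_reason = KMSG_DUMP_OOPS,   /* only oops and panic dumps */
};

static int __init example_dumper_init(void)
{
        return kmsg_dump_register(&example_dumper);
}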
Signed-off-by: Kay Sievers Tested-by: Tony Luck Reported-by: Anton Vorontsov Tested-by: Anton Vorontsov Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/nvram.c | 61 +------- arch/x86/platform/mrst/early_printk_mrst.c | 13 +- drivers/mtd/mtdoops.c | 22 +-- fs/pstore/platform.c | 34 ++--- include/linux/kmsg_dump.h | 45 +++++- kernel/printk.c | 220 +++++++++++++++++++++++++---- 6 files changed, 258 insertions(+), 137 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c index 36f957f31842..8733a86ad52e 100644 --- a/arch/powerpc/platforms/pseries/nvram.c +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -68,9 +68,7 @@ static const char *pseries_nvram_os_partitions[] = { }; static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *old_msgs, unsigned long old_len, - const char *new_msgs, unsigned long new_len); + enum kmsg_dump_reason reason); static struct kmsg_dumper nvram_kmsg_dumper = { .dump = oops_to_nvram @@ -503,28 +501,6 @@ int __init pSeries_nvram_init(void) return 0; } -/* - * Try to capture the last capture_len bytes of the printk buffer. Return - * the amount actually captured. - */ -static size_t capture_last_msgs(const char *old_msgs, size_t old_len, - const char *new_msgs, size_t new_len, - char *captured, size_t capture_len) -{ - if (new_len >= capture_len) { - memcpy(captured, new_msgs + (new_len - capture_len), - capture_len); - return capture_len; - } else { - /* Grab the end of old_msgs. */ - size_t old_tail_len = min(old_len, capture_len - new_len); - memcpy(captured, old_msgs + (old_len - old_tail_len), - old_tail_len); - memcpy(captured + old_tail_len, new_msgs, new_len); - return old_tail_len + new_len; - } -} - /* * Are we using the ibm,rtas-log for oops/panic reports? And if so, * would logging this oops/panic overwrite an RTAS event that rtas_errd @@ -541,27 +517,6 @@ static int clobbering_unread_rtas_event(void) NVRAM_RTAS_READ_TIMEOUT); } -/* Squeeze out each line's severity prefix. */ -static size_t elide_severities(char *buf, size_t len) -{ - char *in, *out, *buf_end = buf + len; - /* Assume a at the very beginning marks the start of a line. */ - int newline = 1; - - in = out = buf; - while (in < buf_end) { - if (newline && in+3 <= buf_end && - *in == '<' && isdigit(in[1]) && in[2] == '>') { - in += 3; - newline = 0; - } else { - newline = (*in == '\n'); - *out++ = *in++; - } - } - return out - buf; -} - /* Derived from logfs_compress() */ static int nvram_compress(const void *in, void *out, size_t inlen, size_t outlen) @@ -619,9 +574,7 @@ static int zip_oops(size_t text_len) * partition. If that's too much, go back and capture uncompressed text. 
*/ static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *old_msgs, unsigned long old_len, - const char *new_msgs, unsigned long new_len) + enum kmsg_dump_reason reason) { static unsigned int oops_count = 0; static bool panicking = false; @@ -660,14 +613,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, return; if (big_oops_buf) { - text_len = capture_last_msgs(old_msgs, old_len, - new_msgs, new_len, big_oops_buf, big_oops_buf_sz); - text_len = elide_severities(big_oops_buf, text_len); + kmsg_dump_get_buffer(dumper, false, + big_oops_buf, big_oops_buf_sz, &text_len); rc = zip_oops(text_len); } if (rc != 0) { - text_len = capture_last_msgs(old_msgs, old_len, - new_msgs, new_len, oops_data, oops_data_sz); + kmsg_dump_rewind(dumper); + kmsg_dump_get_buffer(dumper, true, + oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; *oops_len = (u16) text_len; } diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c index 3c6e328483c7..028454f0c3a5 100644 --- a/arch/x86/platform/mrst/early_printk_mrst.c +++ b/arch/x86/platform/mrst/early_printk_mrst.c @@ -110,19 +110,16 @@ static struct kmsg_dumper dw_dumper; static int dumper_registered; static void dw_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { - int i; + static char line[1024]; + size_t len; /* When run to this, we'd better re-init the HW */ mrst_early_console_init(); - for (i = 0; i < l1; i++) - early_mrst_console.write(&early_mrst_console, s1 + i, 1); - for (i = 0; i < l2; i++) - early_mrst_console.write(&early_mrst_console, s2 + i, 1); + while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) + early_mrst_console.write(&early_mrst_console, line, len); } /* Set the ratio rate to 115200, 8n1, IRQ disabled */ diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index ae36d7e1e913..551e316e4454 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -304,32 +304,17 @@ static void find_next_position(struct mtdoops_context *cxt) } static void mtdoops_do_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { struct mtdoops_context *cxt = container_of(dumper, struct mtdoops_context, dump); - unsigned long s1_start, s2_start; - unsigned long l1_cpy, l2_cpy; - char *dst; - - if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC) - return; /* Only dump oopses if dump_oops is set */ if (reason == KMSG_DUMP_OOPS && !dump_oops) return; - dst = cxt->oops_buf + MTDOOPS_HEADER_SIZE; /* Skip the header */ - l2_cpy = min(l2, record_size - MTDOOPS_HEADER_SIZE); - l1_cpy = min(l1, record_size - MTDOOPS_HEADER_SIZE - l2_cpy); - - s2_start = l2 - l2_cpy; - s1_start = l1 - l1_cpy; - - memcpy(dst, s1 + s1_start, l1_cpy); - memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); + kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, + record_size - MTDOOPS_HEADER_SIZE, NULL); /* Panics must be written immediately */ if (reason != KMSG_DUMP_OOPS) @@ -375,6 +360,7 @@ static void mtdoops_notify_add(struct mtd_info *mtd) return; } + cxt->dump.max_reason = KMSG_DUMP_OOPS; cxt->dump.dump = mtdoops_do_dump; err = kmsg_dump_register(&cxt->dump); if (err) { diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 82c585f715e3..03ce7a9b81cc 100644 --- a/fs/pstore/platform.c 
+++ b/fs/pstore/platform.c @@ -94,20 +94,15 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) * as we can from the end of the buffer. */ static void pstore_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2) + enum kmsg_dump_reason reason) { - unsigned long s1_start, s2_start; - unsigned long l1_cpy, l2_cpy; - unsigned long size, total = 0; - char *dst; + unsigned long total = 0; const char *why; u64 id; - int hsize, ret; unsigned int part = 1; unsigned long flags = 0; int is_locked = 0; + int ret; why = get_reason_str(reason); @@ -119,30 +114,25 @@ static void pstore_dump(struct kmsg_dumper *dumper, spin_lock_irqsave(&psinfo->buf_lock, flags); oopscount++; while (total < kmsg_bytes) { + char *dst; + unsigned long size; + int hsize; + size_t len; + dst = psinfo->buf; hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, part); size = psinfo->bufsize - hsize; dst += hsize; - l2_cpy = min(l2, size); - l1_cpy = min(l1, size - l2_cpy); - - if (l1_cpy + l2_cpy == 0) + if (!kmsg_dump_get_buffer(dumper, true, dst, size, &len)) break; - s2_start = l2 - l2_cpy; - s1_start = l1 - l1_cpy; - - memcpy(dst, s1 + s1_start, l1_cpy); - memcpy(dst + l1_cpy, s2 + s2_start, l2_cpy); - ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - hsize + l1_cpy + l2_cpy, psinfo); + hsize + len, psinfo); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - l1 -= l1_cpy; - l2 -= l2_cpy; - total += l1_cpy + l2_cpy; + + total += hsize + len; part++; } if (in_nmi()) { diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 35f7237ec972..af4eb5a39d9a 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -21,6 +21,7 @@ * is passed to the kernel. */ enum kmsg_dump_reason { + KMSG_DUMP_UNDEF, KMSG_DUMP_PANIC, KMSG_DUMP_OOPS, KMSG_DUMP_EMERG, @@ -31,23 +32,37 @@ enum kmsg_dump_reason { /** * struct kmsg_dumper - kernel crash message dumper structure - * @dump: The callback which gets called on crashes. The buffer is passed - * as two sections, where s1 (length l1) contains the older - * messages and s2 (length l2) contains the newer. 
* @list: Entry in the dumper list (private) + * @dump: Call into dumping code which will retrieve the data with + * through the record iterator + * @max_reason: filter for highest reason number that should be dumped * @registered: Flag that specifies if this is already registered */ struct kmsg_dumper { - void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, - const char *s1, unsigned long l1, - const char *s2, unsigned long l2); struct list_head list; - int registered; + void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); + enum kmsg_dump_reason max_reason; + bool active; + bool registered; + + /* private state of the kmsg iterator */ + u32 cur_idx; + u32 next_idx; + u64 cur_seq; + u64 next_seq; }; #ifdef CONFIG_PRINTK void kmsg_dump(enum kmsg_dump_reason reason); +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len); + +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len); + +void kmsg_dump_rewind(struct kmsg_dumper *dumper); + int kmsg_dump_register(struct kmsg_dumper *dumper); int kmsg_dump_unregister(struct kmsg_dumper *dumper); @@ -56,6 +71,22 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) { } +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + const char *line, size_t size, size_t *len) +{ + return false; +} + +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + return false; +} + +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ +} + static inline int kmsg_dump_register(struct kmsg_dumper *dumper) { return -EINVAL; diff --git a/kernel/printk.c b/kernel/printk.c index f205c25c37e2..ceb4a2f775a1 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -909,7 +909,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. - */ + */ seq = clear_seq; idx = clear_idx; while (seq < log_next_seq) { @@ -919,6 +919,8 @@ static int syslog_print_all(char __user *buf, int size, bool clear) idx = log_next(idx); seq++; } + + /* move first record forward until length fits into the buffer */ seq = clear_seq; idx = clear_idx; while (len > size && seq < log_next_seq) { @@ -929,7 +931,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear) seq++; } - /* last message in this dump */ + /* last message fitting into this dump */ next_seq = log_next_seq; len = 0; @@ -2300,48 +2302,210 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); * kmsg_dump - dump kernel log to kernel message dumpers. * @reason: the reason (oops, panic etc) for dumping * - * Iterate through each of the dump devices and call the oops/panic - * callbacks with the log buffer. + * Call each of the registered dumper's dump() callback, which can + * retrieve the kmsg records with kmsg_dump_get_line() or + * kmsg_dump_get_buffer(). */ void kmsg_dump(enum kmsg_dump_reason reason) { - u64 idx; struct kmsg_dumper *dumper; - const char *s1, *s2; - unsigned long l1, l2; unsigned long flags; if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) return; - /* Theoretically, the log could move on after we do this, but - there's not a lot we can do about that. The new messages - will overwrite the start of what we dump. 
*/ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + if (dumper->max_reason && reason > dumper->max_reason) + continue; + + /* initialize iterator with data about the stored records */ + dumper->active = true; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + + /* invoke dumper which will iterate over records */ + dumper->dump(dumper, reason); + + /* reset iterator */ + dumper->active = false; + } + rcu_read_unlock(); +} + +/** + * kmsg_dump_get_line - retrieve one kmsg log line + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the beginning of the kmsg buffer, with the oldest kmsg + * record, and copy one record into the provided buffer. + * + * Consecutive calls will return the next available record moving + * towards the end of the buffer with the youngest messages. + * + * A return value of FALSE indicates that there are no more records to + * read. + */ +bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, + char *line, size_t size, size_t *len) +{ + unsigned long flags; + struct log *msg; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; raw_spin_lock_irqsave(&logbuf_lock, flags); - if (syslog_seq < log_first_seq) - idx = syslog_idx; - else - idx = log_first_idx; + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } - if (idx > log_next_idx) { - s1 = log_buf; - l1 = log_next_idx; + /* last entry */ + if (dumper->cur_seq >= log_next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } - s2 = log_buf + idx; - l2 = log_buf_len - idx; - } else { - s1 = ""; - l1 = 0; + msg = log_from_idx(dumper->cur_idx); + l = msg_print_text(msg, syslog, + line, size); + + dumper->cur_idx = log_next(dumper->cur_idx); + dumper->cur_seq++; + ret = true; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_line); + +/** + * kmsg_dump_get_buffer - copy kmsg log lines + * @dumper: registered kmsg dumper + * @syslog: include the "<4>" prefixes + * @line: buffer to copy the line to + * @size: maximum size of the buffer + * @len: length of line placed into buffer + * + * Start at the end of the kmsg buffer and fill the provided buffer + * with as many of the the *youngest* kmsg records that fit into it. + * If the buffer is large enough, all available kmsg records will be + * copied with a single call. + * + * Consecutive calls will fill the buffer with the next block of + * available older records, not including the earlier retrieved ones. + * + * A return value of FALSE indicates that there are no more records to + * read. 
+ */ +bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, + char *buf, size_t size, size_t *len) +{ + unsigned long flags; + u64 seq; + u32 idx; + u64 next_seq; + u32 next_idx; + size_t l = 0; + bool ret = false; + + if (!dumper->active) + goto out; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (dumper->cur_seq < log_first_seq) { + /* messages are gone, move to first available one */ + dumper->cur_seq = log_first_seq; + dumper->cur_idx = log_first_idx; + } + + /* last entry */ + if (dumper->cur_seq >= dumper->next_seq) { + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + goto out; + } + + /* calculate length of entire buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; + } + + /* move first record forward until length fits into the buffer */ + seq = dumper->cur_seq; + idx = dumper->cur_idx; + while (l > size && seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); - s2 = log_buf + idx; - l2 = log_next_idx - idx; + l -= msg_print_text(msg, true, NULL, 0); + idx = log_next(idx); + seq++; } + + /* last message in next interation */ + next_seq = seq; + next_idx = idx; + + l = 0; + while (seq < dumper->next_seq) { + struct log *msg = log_from_idx(idx); + + l += msg_print_text(msg, syslog, + buf + l, size - l); + + idx = log_next(idx); + seq++; + } + + dumper->next_seq = next_seq; + dumper->next_idx = next_idx; + ret = true; raw_spin_unlock_irqrestore(&logbuf_lock, flags); +out: + if (len) + *len = l; + return ret; +} +EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); - rcu_read_lock(); - list_for_each_entry_rcu(dumper, &dump_list, list) - dumper->dump(dumper, reason, s1, l1, s2, l2); - rcu_read_unlock(); +/** + * kmsg_dump_rewind - reset the interator + * @dumper: registered kmsg dumper + * + * Reset the dumper's iterator so that kmsg_dump_get_line() and + * kmsg_dump_get_buffer() can be called again and used multiple + * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper *dumper) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&logbuf_lock, flags); + dumper->cur_seq = clear_seq; + dumper->cur_idx = clear_idx; + dumper->next_seq = log_next_seq; + dumper->next_idx = log_next_idx; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); } +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); #endif -- cgit v1.2.3 From 4a77a5a06ec66ed05199b301e7c25f42f979afdc Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Sat, 16 Jun 2012 21:21:51 +0800 Subject: printk: use mutex lock to stop syslog_seq from going wild Although syslog_seq and log_next_seq stuff are protected by logbuf_lock spin log, it's not enough. Say we have two processes A and B, and let syslog_seq = N, while log_next_seq = N + 1, and the two processes both come to syslog_print at almost the same time. And No matter which process get the spin lock first, it will increase syslog_seq by one, then release spin lock; thus later, another process increase syslog_seq by one again. In this case, syslog_seq is bigger than syslog_next_seq. And latter, it would make: wait_event_interruptiable(log_wait, syslog != log_next_seq) don't wait any more even there is no new write comes. Thus it introduce a infinite loop reading. I can easily see this kind of issue by the following steps: # cat /proc/kmsg # at meantime, I don't kill rsyslog # So they are the two processes. 
# xinit # I added drm.debug=6 in the kernel parameter line, # so that it will produce lots of message and let that # issue happen It's 100% reproducable on my side. And my disk will be filled up by /var/log/messages in a quite short time. So, introduce a mutex_lock to stop syslog_seq from going wild just like what devkmsg_read() does. It does fix this issue as expected. v2: use mutex_lock_interruptiable() instead (comments from Kay) Signed-off-by: Yuanhan Liu Reviewed-by: Fengguang Wu Acked-By: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index ceb4a2f775a1..572730bd8a5c 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -414,7 +414,9 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (!user) return -EBADF; - mutex_lock(&user->lock); + ret = mutex_lock_interruptible(&user->lock); + if (ret) + return ret; raw_spin_lock(&logbuf_lock); while (user->seq == log_next_seq) { if (file->f_flags & O_NONBLOCK) { @@ -976,6 +978,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) { bool clear = false; static int saved_console_loglevel = -1; + static DEFINE_MUTEX(syslog_mutex); int error; error = check_syslog_permissions(type, from_file); @@ -1002,11 +1005,17 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) error = -EFAULT; goto out; } + error = mutex_lock_interruptible(&syslog_mutex); + if (error) + goto out; error = wait_event_interruptible(log_wait, syslog_seq != log_next_seq); - if (error) + if (error) { + mutex_unlock(&syslog_mutex); goto out; + } error = syslog_print(buf, len); + mutex_unlock(&syslog_mutex); break; /* Read/clear last kernel messages */ case SYSLOG_ACTION_READ_CLEAR: -- cgit v1.2.3 From b56a39ac263e5b8cafedd551a49c2105e68b98c2 Mon Sep 17 00:00:00 2001 From: Yuanhan Liu Date: Sat, 16 Jun 2012 12:40:55 +0800 Subject: printk: return -EINVAL if the message len is bigger than the buf size Just like what devkmsg_read() does, return -EINVAL if the message len is bigger than the buf size, or it will trigger a segfault error. Acked-by: Kay Sievers Acked-by: Fengguang Wu Signed-off-by: Yuanhan Liu Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 572730bd8a5c..a2276b916769 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -880,7 +880,9 @@ static int syslog_print(char __user *buf, int size) syslog_seq++; raw_spin_unlock_irq(&logbuf_lock); - if (len > 0 && copy_to_user(buf, text, len)) + if (len > size) + len = -EINVAL; + else if (len > 0 && copy_to_user(buf, text, len)) len = -EFAULT; kfree(text); -- cgit v1.2.3 From 9c5da09d266ca9b32eb16cf940f8161d949c2fe5 Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 14 Jun 2012 15:31:09 -0700 Subject: perf: Use css_tryget() to avoid propping up css refcount An rmdir pushes css's ref count to zero. However, if the associated directory is open at the time, the dentry ref count is non-zero. If the fd for this directory is then passed into perf_event_open, it does a css_get(). This bounces the ref count back up from zero. This is a problem by itself. But what makes it turn into a crash is the fact that we end up doing an extra dput, since we perform a dput when css_put sees the ref count go down to zero. css_tryget() does not fall into that trap. So, we use that instead. 
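In short, the two primitives differ in how they treat a group that is already going away: css_get() unconditionally bumps the counter (bouncing a zero count back up), while css_tryget() fails once the count has dropped to zero. A rough sketch of the contrast (illustrative only; the real hunk is in the diff further below):

	/* broken: blindly bumps a refcount that may already be zero */
	css_get(&event->cgrp->css);

	/* fixed: the get may fail once the css is being torn down */
	if (!css_tryget(&event->cgrp->css))
		return -ENOENT;	/* cgroup is on its way out; refuse to attach */
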
Reproduction test-case for the bug: #include #include #include #include #include #include #include #include #include #define PERF_FLAG_PID_CGROUP (1U << 2) int perf_event_open(struct perf_event_attr *hw_event_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open,hw_event_uptr, pid, cpu, group_fd, flags); } /* * Directly poke at the perf_event bug, since it's proving hard to repro * depending on where in the kernel tree. what moved? */ int main(int argc, char **argv) { int fd; struct perf_event_attr attr; memset(&attr, 0, sizeof(attr)); attr.exclude_kernel = 1; attr.size = sizeof(attr); mkdir("/dev/cgroup/perf_event/blah", 0777); fd = open("/dev/cgroup/perf_event/blah", O_RDONLY); perror("open"); rmdir("/dev/cgroup/perf_event/blah"); sleep(2); perf_event_open(&attr, fd, 0, -1, PERF_FLAG_PID_CGROUP); perror("perf_event_open"); close(fd); return 0; } Signed-off-by: Salman Qazi Signed-off-by: Peter Zijlstra Acked-by: Tejun Heo Link: http://lkml.kernel.org/r/20120614223108.1025.2503.stgit@dungbeetle.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f85c0154b333..d7d71d6ec972 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -253,9 +253,9 @@ perf_cgroup_match(struct perf_event *event) return !event->cgrp || event->cgrp == cpuctx->cgrp; } -static inline void perf_get_cgroup(struct perf_event *event) +static inline bool perf_tryget_cgroup(struct perf_event *event) { - css_get(&event->cgrp->css); + return css_tryget(&event->cgrp->css); } static inline void perf_put_cgroup(struct perf_event *event) @@ -484,7 +484,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, event->cgrp = cgrp; /* must be done before we fput() the file */ - perf_get_cgroup(event); + if (!perf_tryget_cgroup(event)) { + event->cgrp = NULL; + ret = -ENOENT; + goto out; + } /* * all events in a group must monitor -- cgit v1.2.3 From 8e3bbf42c6d73881956863cc3305456afe2bc4ea Mon Sep 17 00:00:00 2001 From: Salman Qazi Date: Thu, 14 Jun 2012 14:55:30 -0700 Subject: cgroups: Account for CSS_DEACT_BIAS in __css_put When we fixed the race between atomic_dec and css_refcnt, we missed the fact that css_refcnt internally subtracts CSS_DEACT_BIAS to get the actual reference count. This can potentially cause a refcount leak if __css_put races with cgroup_clear_css_refs. Signed-off-by: Salman Qazi Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ceeafe874b3f..2097684cf194 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -255,12 +255,17 @@ int cgroup_lock_is_held(void) EXPORT_SYMBOL_GPL(cgroup_lock_is_held); +static int css_unbias_refcnt(int refcnt) +{ + return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; +} + /* the current nr of refs, always >= 0 whether @css is deactivated or not */ static int css_refcnt(struct cgroup_subsys_state *css) { int v = atomic_read(&css->refcnt); - return v >= 0 ? 
v : v - CSS_DEACT_BIAS; + return css_unbias_refcnt(v); } /* convenient tests for these bits */ @@ -4982,9 +4987,12 @@ EXPORT_SYMBOL_GPL(__css_tryget); void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int v; rcu_read_lock(); - switch (atomic_dec_return(&css->refcnt)) { + v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); + + switch (v) { case 1: if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.2.3 From 4fe7efdbdfb1c7e7a7f31decfd831c0f31d37091 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 20 Jun 2012 12:53:01 -0700 Subject: mm: correctly synchronize rss-counters at exit/exec do_exit() and exec_mmap() call sync_mm_rss() before mm_release() does put_user(clear_child_tid) which can update task->rss_stat and thus make mm->rss_stat inconsistent. This triggers the "BUG:" printk in check_mm(). Let's fix this bug in the safest way, and optimize/cleanup this later. Reported-by: Markus Trippelsdorf Signed-off-by: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- kernel/exit.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index a79786a8d2c8..da27b91ff1e8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -819,10 +819,10 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { + sync_mm_rss(old_mm); /* * Make sure that if there is a core dump in progress * for the old mm, we get out and die instead of going diff --git a/kernel/exit.c b/kernel/exit.c index 34867cc5b42a..c0277d3f1aaa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -643,6 +643,7 @@ static void exit_mm(struct task_struct * tsk) mm_release(tsk, mm); if (!mm) return; + sync_mm_rss(mm); /* * Serialize with any possible pending coredump. * We must hold mmap_sem around checking core_state -- cgit v1.2.3 From 6347e90091041e34bea625370794c92f4ce71228 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Jun 2012 12:53:03 -0700 Subject: pidns: guarantee that the pidns init will be the last pidns process reaped Today we have a twofold bug. Sometimes release_task on pid == 1 in a pid namespace can run before other processes in a pid namespace have had release task called. With the result that pid_ns_release_proc can be called before the last proc_flus_task() is done using upid->ns->proc_mnt, resulting in the use of a stale pointer. This same set of circumstances can lead to waitpid(...) returning for a processes started with clone(CLONE_NEWPID) before the every process in the pid namespace has actually exited. To fix this modify zap_pid_ns_processess wait until all other processes in the pid namespace have exited, even EXIT_DEAD zombies. The delay_group_leader and related tests ensure that the thread gruop leader will be the last thread of a process group to be reaped, or to become EXIT_DEAD and self reap. With the change to zap_pid_ns_processes we get the guarantee that pid == 1 in a pid namespace will be the last task that release_task is called on. With pid == 1 being the last task to pass through release_task pid_ns_release_proc can no longer be called too early nor can wait return before all of the EXIT_DEAD tasks in a pid namespace have exited. Signed-off-by: Eric W. 
Biederman Signed-off-by: Oleg Nesterov Cc: Louis Rilling Cc: Mike Galbraith Acked-by: Pavel Emelyanov Tested-by: Andrew Wagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 +++++++++++++- kernel/pid_namespace.c | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index c0277d3f1aaa..a85efd2348bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,7 +64,6 @@ static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; - detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -72,7 +71,20 @@ static void __unhash_process(struct task_struct *p, bool group_dead) list_del_rcu(&p->tasks); list_del_init(&p->sibling); __this_cpu_dec(process_counts); + /* + * If we are the last child process in a pid namespace to be + * reaped, notify the reaper sleeping zap_pid_ns_processes(). + */ + if (IS_ENABLED(CONFIG_PID_NS)) { + struct task_struct *parent = p->real_parent; + + if ((task_active_pid_ns(p)->child_reaper == parent) && + list_empty(&parent->children) && + (parent->flags & PF_EXITING)) + wake_up_process(parent); + } } + detach_pid(p, PIDTYPE_PID); list_del_rcu(&p->thread_group); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 16b20e38c4a1..b3c7fd554250 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,11 +184,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } read_unlock(&tasklist_lock); + /* Firstly reap the EXIT_ZOMBIE children we may have. */ do { clear_thread_flag(TIF_SIGPENDING); rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + /* + * sys_wait4() above can't reap the TASK_DEAD children. + * Make sure they all go away, see __unhash_process(). + */ + for (;;) { + bool need_wait = false; + + read_lock(&tasklist_lock); + if (!list_empty(¤t->children)) { + __set_current_state(TASK_UNINTERRUPTIBLE); + need_wait = true; + } + read_unlock(&tasklist_lock); + + if (!need_wait) + break; + schedule(); + } + if (pid_ns->reboot) current->signal->group_exit_code = pid_ns->reboot; -- cgit v1.2.3 From 50d75f8daead8a1f850c40a3b6c6575ab19b48cf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 20 Jun 2012 12:53:04 -0700 Subject: pidns: find_new_reaper() can no longer switch to init_pid_ns.child_reaper find_new_reaper() changes pid_ns->child_reaper, see add0d4df ("pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing"). The original reason has gone away after the previous patch, ->children list must be empty after zap_pid_ns_processes(). However now we can not switch to init_pid_ns.child_reaper. __unhash_process() relies on the "->child_reaper == parent" check, but this check does not work if the last exiting task is also the child reaper. As Eric sugested, we can change __unhash_process() to use the parent's pid_ns and remove this code. Also, with this change we can move detach_pid(PIDTYPE_PID) back, where it was before the previous fix. Signed-off-by: Oleg Nesterov Acked-by: "Eric W. 
Biederman" Cc: Louis Rilling Cc: Mike Galbraith Acked-by: Pavel Emelyanov Tested-by: Andrew Wagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index a85efd2348bd..2f59cc334516 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; + detach_pid(p, PIDTYPE_PID); if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -78,13 +79,12 @@ static void __unhash_process(struct task_struct *p, bool group_dead) if (IS_ENABLED(CONFIG_PID_NS)) { struct task_struct *parent = p->real_parent; - if ((task_active_pid_ns(p)->child_reaper == parent) && + if ((task_active_pid_ns(parent)->child_reaper == parent) && list_empty(&parent->children) && (parent->flags & PF_EXITING)) wake_up_process(parent); } } - detach_pid(p, PIDTYPE_PID); list_del_rcu(&p->thread_group); } @@ -732,12 +732,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father) zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); - /* - * We can not clear ->child_reaper or leave it alone. - * There may by stealth EXIT_DEAD tasks on ->children, - * forget_original_parent() must move them somewhere. - */ - pid_ns->child_reaper = init_pid_ns.child_reaper; } else if (father->signal->has_child_subreaper) { struct task_struct *reaper; -- cgit v1.2.3 From 5702c5eeab959e86ee2d9b4fe7f2d87e65b25d46 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 20 Jun 2012 12:53:04 -0700 Subject: c/r: prctl: Move PR_GET_TID_ADDRESS to a proper place During merging of PR_GET_TID_ADDRESS patch the code has been misplaced (it happened to appear under PR_MCE_KILL) in result noone can use this option. Fix it by moving code snippet to a proper place. Signed-off-by: Cyrill Gorcunov Acked-by: Kees Cook Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Andrey Vagin Cc: Serge Hallyn Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index f0ec44dcd415..e0c8ffc50d7f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2127,9 +2127,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; - case PR_GET_TID_ADDRESS: - error = prctl_get_tid_address(me, (int __user **)arg2); - break; default: return -EINVAL; } @@ -2147,6 +2144,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; error = 0; -- cgit v1.2.3 From 4661e3568a7d14a93d4e428d246cdb86f4bac6e7 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 22 Jun 2012 17:12:19 -0400 Subject: printk: fix regression in SYSLOG_ACTION_CLEAR Commit 7ff9554bb578ba02166071d2d487b7fc7d860d62 (printk: convert byte-buffer to variable-length record buffer) introduced a regression by accidentally removing a "break" statement from inside the big switch in printk's do_syslog(). The symptom of this bug is that the "dmesg -C" command doesn't only clear the kernel's log buffer; it also disables console logging. 
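Because the syslog(2) multiplexer takes plain integer actions, the regression is easy to demonstrate from userspace. A minimal check, assuming the glibc klogctl() wrapper and the action numbers from include/linux/syslog.h (CLEAR == 5, CONSOLE_ON == 7):

	#include <stdio.h>
	#include <sys/klog.h>

	int main(void)
	{
		/* SYSLOG_ACTION_CLEAR: empty the kernel ring buffer */
		if (klogctl(5, NULL, 0) < 0)
			perror("klogctl(CLEAR)");
		/*
		 * On a kernel with the missing "break" the call above has also
		 * switched the console off; SYSLOG_ACTION_CONSOLE_ON undoes it.
		 */
		if (klogctl(7, NULL, 0) < 0)
			perror("klogctl(CONSOLE_ON)");
		return 0;
	}
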
This patch (as1561) fixes the regression by adding the missing "break". Signed-off-by: Alan Stern CC: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index a2276b916769..d6a1412f6b09 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1040,6 +1040,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) /* Clear ring buffer */ case SYSLOG_ACTION_CLEAR: syslog_print_all(NULL, 0, true); + break; /* Disable logging to console */ case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) -- cgit v1.2.3 From b41772abebc27c61dd578b76da99aa5240b4c99a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 21 Jun 2012 20:50:42 -0700 Subject: rcu: Stop rcu_do_batch() from multiplexing the "count" variable Commit b1420f1c (Make rcu_barrier() less disruptive) rearranged the code in rcu_do_batch(), moving the ->qlen manipulation to follow the requeueing of the callbacks. Unfortunately, this rearrangement clobbered the value of the "count" local variable before the value of rdp->qlen was adjusted, resulting in the value of rdp->qlen being inaccurate. This commit therefore introduces an index variable "i", avoiding the inadvertent multiplexing. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3b0f1337f75b..38ecdda3f55f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1530,7 +1530,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count, count_lazy; + int bl, count, count_lazy, i; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { @@ -1553,9 +1553,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; tail = rdp->nxttail[RCU_DONE_TAIL]; - for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) - if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[count] = &rdp->nxtlist; + for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) + if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[i] = &rdp->nxtlist; local_irq_restore(flags); /* Invoke callbacks. */ @@ -1583,9 +1583,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) if (list != NULL) { *tail = rdp->nxtlist; rdp->nxtlist = list; - for (count = 0; count < RCU_NEXT_SIZE; count++) - if (&rdp->nxtlist == rdp->nxttail[count]) - rdp->nxttail[count] = tail; + for (i = 0; i < RCU_NEXT_SIZE; i++) + if (&rdp->nxtlist == rdp->nxttail[i]) + rdp->nxttail[i] = tail; else break; } -- cgit v1.2.3 From 6fda135c908d0f38a0167adcbd71094572e3059b Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 26 Jun 2012 12:35:24 -0700 Subject: Revert "printk: return -EINVAL if the message len is bigger than the buf size" This reverts commit b56a39ac263e5b8cafedd551a49c2105e68b98c2. A better patch from Jan will follow this to resolve the issue. 
Acked-by: Kay Sievers Cc: Fengguang Wu Cc: Yuanhan Liu Cc: Jan Beulich Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index d6a1412f6b09..ff05361962e1 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -880,9 +880,7 @@ static int syslog_print(char __user *buf, int size) syslog_seq++; raw_spin_unlock_irq(&logbuf_lock); - if (len > size) - len = -EINVAL; - else if (len > 0 && copy_to_user(buf, text, len)) + if (len > 0 && copy_to_user(buf, text, len)) len = -EFAULT; kfree(text); -- cgit v1.2.3 From 116e90b23f74d303e8d607c7a7d54f60f14ab9f2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 22 Jun 2012 16:36:09 +0100 Subject: syslog: fill buffer with more than a single message for SYSLOG_ACTION_READ The recent changes to the printk buffer management resulted in SYSLOG_ACTION_READ to only return a single message, whereas previously the buffer would get filled as much as possible. As, when too small to fit everything, filling it to the last byte would be pretty ugly with the new code, the patch arranges for as many messages as possible to get returned in a single invocation. User space tools in at least all SLES versions depend on the old behavior. This at once addresses the issue attempted to get fixed with commit b56a39ac263e5b8cafedd551a49c2105e68b98c2 ("printk: return -EINVAL if the message len is bigger than the buf size"), and since that commit widened the possibility for losing a message altogether, the patch here assumes that this other commit would get reverted first (otherwise the patch here won't apply). Furthermore, this patch also addresses the problem dealt with in commit 4a77a5a06ec66ed05199b301e7c25f42f979afdc ("printk: use mutex lock to stop syslog_seq from going wild"), so I'd recommend reverting that one too (albeit there's no direct collision between the two). 
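The behaviour being restored is the one a classic syslogd reader depends on: one blocking SYSLOG_ACTION_READ call may return several complete records, as many as fit into the caller's buffer. A rough sketch of such a consumer, again via the glibc klogctl() wrapper (READ == 2; error handling trimmed, and note the read is destructive, like /proc/kmsg):

	#include <stdio.h>
	#include <sys/klog.h>

	int main(void)
	{
		char buf[8192];
		int n;

		/*
		 * Blocks until messages are available, then should return as
		 * many complete messages as fit into buf, not just one.
		 */
		while ((n = klogctl(2, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
		return 0;
	}
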
Signed-off-by: Jan Beulich Acked-by: Kay Sievers Cc: Yuanhan Liu Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 51 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index ff05361962e1..cdfba44fedf0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -862,26 +862,49 @@ static int syslog_print(char __user *buf, int size) { char *text; struct log *msg; - int len; + int len = 0; text = kmalloc(LOG_LINE_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - raw_spin_lock_irq(&logbuf_lock); - if (syslog_seq < log_first_seq) { - /* messages are gone, move to first one */ - syslog_seq = log_first_seq; - syslog_idx = log_first_idx; - } - msg = log_from_idx(syslog_idx); - len = msg_print_text(msg, true, text, LOG_LINE_MAX); - syslog_idx = log_next(syslog_idx); - syslog_seq++; - raw_spin_unlock_irq(&logbuf_lock); + while (size > 0) { + size_t n; + + raw_spin_lock_irq(&logbuf_lock); + if (syslog_seq < log_first_seq) { + /* messages are gone, move to first one */ + syslog_seq = log_first_seq; + syslog_idx = log_first_idx; + } + if (syslog_seq == log_next_seq) { + raw_spin_unlock_irq(&logbuf_lock); + break; + } + msg = log_from_idx(syslog_idx); + n = msg_print_text(msg, true, text, LOG_LINE_MAX); + if (n <= size) { + syslog_idx = log_next(syslog_idx); + syslog_seq++; + } else + n = 0; + raw_spin_unlock_irq(&logbuf_lock); + + if (!n) + break; - if (len > 0 && copy_to_user(buf, text, len)) - len = -EFAULT; + len += n; + size -= n; + buf += n; + n = copy_to_user(buf - n, text, n); + + if (n) { + len -= n; + if (!len) + len = -EFAULT; + break; + } + } kfree(text); return len; -- cgit v1.2.3 From 084681d14e429cb6192262ac7437f00e2c02f26a Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 28 Jun 2012 09:38:53 +0200 Subject: printk: flush continuation lines immediately to console Continuation lines are buffered internally, intended to merge the chunked printk()s into a single record, and to isolate potentially racy continuation users from usual terminated line users. This though, has the effect that partial lines are not printed to the console in the moment they are emitted. In case the kernel crashes in the meantime, the potentially interesting printed information would never reach the consoles. Here we share the continuation buffer with the console copy logic, and partial lines are always immediately flushed to the available consoles. They are still buffered internally to improve the readability and integrity of the messages and minimize the amount of needed record headers to store. Signed-off-by: Kay Sievers Tested-by: Steven Rostedt Acked-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 244 ++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 176 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index cdfba44fedf0..fbf4d0b22a1d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -193,12 +193,19 @@ static int console_may_schedule; * separated by ',', and find the message after the ';' character. 
*/ +enum log_flags { + LOG_DEFAULT = 0, + LOG_NOCONS = 1, /* already flushed, do not print to console */ +}; + struct log { u64 ts_nsec; /* timestamp in nanoseconds */ u16 len; /* length of entire record */ u16 text_len; /* length of text buffer */ u16 dict_len; /* length of dictionary buffer */ - u16 level; /* syslog level + facility */ + u8 facility; /* syslog facility */ + u8 flags:5; /* internal record flags */ + u8 level:3; /* syslog level */ }; /* @@ -286,6 +293,7 @@ static u32 log_next(u32 idx) /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, const char *dict, u16 dict_len, const char *text, u16 text_len) { @@ -329,8 +337,13 @@ static void log_store(int facility, int level, msg->text_len = text_len; memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; - msg->level = (facility << 3) | (level & 7); - msg->ts_nsec = local_clock(); + msg->facility = facility; + msg->level = level & 7; + msg->flags = flags & 0x1f; + if (ts_nsec > 0) + msg->ts_nsec = ts_nsec; + else + msg->ts_nsec = local_clock(); memset(log_dict(msg) + dict_len, 0, pad_len); msg->len = sizeof(struct log) + text_len + dict_len + pad_len; @@ -446,7 +459,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, ts_usec = msg->ts_nsec; do_div(ts_usec, 1000); len = sprintf(user->buf, "%u,%llu,%llu;", - msg->level, user->seq, ts_usec); + (msg->facility << 3) | msg->level, user->seq, ts_usec); /* escape non-printable characters */ for (i = 0; i < msg->text_len; i++) { @@ -787,6 +800,21 @@ static bool printk_time; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static size_t print_time(u64 ts, char *buf) +{ + unsigned long rem_nsec; + + if (!printk_time) + return 0; + + if (!buf) + return 15; + + rem_nsec = do_div(ts, 1000000000); + return sprintf(buf, "[%5lu.%06lu] ", + (unsigned long)ts, rem_nsec / 1000); +} + static size_t print_prefix(const struct log *msg, bool syslog, char *buf) { size_t len = 0; @@ -803,18 +831,7 @@ static size_t print_prefix(const struct log *msg, bool syslog, char *buf) } } - if (printk_time) { - if (buf) { - unsigned long long ts = msg->ts_nsec; - unsigned long rem_nsec = do_div(ts, 1000000000); - - len += sprintf(buf + len, "[%5lu.%06lu] ", - (unsigned long) ts, rem_nsec / 1000); - } else { - len += 15; - } - } - + len += print_time(msg->ts_nsec, buf ? buf + len : NULL); return len; } @@ -1294,15 +1311,92 @@ static inline void printk_delay(void) } } +/* + * Continuation lines are buffered, and not committed to the record buffer + * until the line is complete, or a race forces it. The line fragments + * though, are printed immediately to the consoles to ensure everything has + * reached the console in case of a kernel crash. 
+ */ +static struct cont { + char buf[LOG_LINE_MAX]; + size_t len; /* length == 0 means unused buffer */ + size_t cons; /* bytes written to console */ + struct task_struct *owner; /* task of first print*/ + u64 ts_nsec; /* time of first print */ + u8 level; /* log level of first message */ + u8 facility; /* log level of first message */ + bool flushed:1; /* buffer sealed and committed */ +} cont; + +static void cont_flush(void) +{ + if (cont.flushed) + return; + if (cont.len == 0) + return; + + log_store(cont.facility, cont.level, LOG_NOCONS, cont.ts_nsec, + NULL, 0, cont.buf, cont.len); + + cont.flushed = true; +} + +static bool cont_add(int facility, int level, const char *text, size_t len) +{ + if (cont.len && cont.flushed) + return false; + + if (cont.len + len > sizeof(cont.buf)) { + cont_flush(); + return false; + } + + if (!cont.len) { + cont.facility = facility; + cont.level = level; + cont.owner = current; + cont.ts_nsec = local_clock(); + cont.cons = 0; + cont.flushed = false; + } + + memcpy(cont.buf + cont.len, text, len); + cont.len += len; + return true; +} + +static size_t cont_print_text(char *text, size_t size) +{ + size_t textlen = 0; + size_t len; + + if (cont.cons == 0) { + textlen += print_time(cont.ts_nsec, text); + size -= textlen; + } + + len = cont.len - cont.cons; + if (len > 0) { + if (len+1 > size) + len = size-1; + memcpy(text + textlen, cont.buf + cont.cons, len); + textlen += len; + cont.cons = cont.len; + } + + if (cont.flushed) { + text[textlen++] = '\n'; + /* got everything, release buffer */ + cont.len = 0; + } + return textlen; +} + asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { static int recursion_bug; - static char cont_buf[LOG_LINE_MAX]; - static size_t cont_len; - static int cont_level; - static struct task_struct *cont_task; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len; @@ -1348,7 +1442,8 @@ asmlinkage int vprintk_emit(int facility, int level, recursion_bug = 0; printed_len += strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, NULL, 0, recursion_msg, printed_len); + log_store(0, 2, LOG_DEFAULT, 0, + NULL, 0, recursion_msg, printed_len); } /* @@ -1386,55 +1481,38 @@ asmlinkage int vprintk_emit(int facility, int level, } if (!newline) { - if (cont_len && (prefix || cont_task != current)) { - /* - * Flush earlier buffer, which is either from a - * different thread, or when we got a new prefix. - */ - log_store(facility, cont_level, NULL, 0, cont_buf, cont_len); - cont_len = 0; - } - - if (!cont_len) { - cont_level = level; - cont_task = current; - } + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or another task also prints continuation lines. + */ + if (cont.len && (prefix || cont.owner != current)) + cont_flush(); - /* buffer or append to earlier buffer from the same thread */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; + /* buffer line if possible, otherwise store it right away */ + if (!cont_add(facility, level, text, text_len)) + log_store(facility, level, LOG_DEFAULT, 0, + dict, dictlen, text, text_len); } else { - if (cont_len && cont_task == current) { - if (prefix) { - /* - * New prefix from the same thread; flush. We - * either got no earlier newline, or we race - * with an interrupt. 
- */ - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - } + bool stored = false; - /* append to the earlier buffer and flush */ - if (cont_len + text_len > sizeof(cont_buf)) - text_len = sizeof(cont_buf) - cont_len; - memcpy(cont_buf + cont_len, text, text_len); - cont_len += text_len; - log_store(facility, cont_level, - NULL, 0, cont_buf, cont_len); - cont_len = 0; - cont_task = NULL; - printed_len = cont_len; - } else { - /* ordinary single and terminated line */ - log_store(facility, level, - dict, dictlen, text, text_len); - printed_len = text_len; + /* + * Flush the conflicting buffer. An earlier newline was missing, + * or we race with a continuation line from an interrupt. + */ + if (cont.len && prefix && cont.owner == current) + cont_flush(); + + /* Merge with our buffer if possible; flush it in any case */ + if (cont.len && cont.owner == current) { + stored = cont_add(facility, level, text, text_len); + cont_flush(); } + + if (!stored) + log_store(facility, level, LOG_DEFAULT, 0, + dict, dictlen, text, text_len); } + printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. @@ -1521,11 +1599,18 @@ EXPORT_SYMBOL(printk); #else #define LOG_LINE_MAX 0 +static struct cont { + size_t len; + size_t cons; + u8 level; + bool flushed:1; +} cont; static struct log *log_from_idx(u32 idx) { return NULL; } static u32 log_next(u32 idx) { return 0; } static void call_console_drivers(int level, const char *text, size_t len) {} static size_t msg_print_text(const struct log *msg, bool syslog, char *buf, size_t size) { return 0; } +static size_t cont_print_text(char *text, size_t size) { return 0; } #endif /* CONFIG_PRINTK */ @@ -1817,6 +1902,7 @@ static u32 console_idx; */ void console_unlock(void) { + static char text[LOG_LINE_MAX]; static u64 seen_seq; unsigned long flags; bool wake_klogd = false; @@ -1829,10 +1915,23 @@ void console_unlock(void) console_may_schedule = 0; + /* flush buffered message fragment immediately to console */ + raw_spin_lock_irqsave(&logbuf_lock, flags); + if (cont.len && (cont.cons < cont.len || cont.flushed)) { + size_t len; + + len = cont_print_text(text, sizeof(text)); + raw_spin_unlock(&logbuf_lock); + stop_critical_timings(); + call_console_drivers(cont.level, text, len); + start_critical_timings(); + local_irq_restore(flags); + } else + raw_spin_unlock_irqrestore(&logbuf_lock, flags); + again: for (;;) { struct log *msg; - static char text[LOG_LINE_MAX]; size_t len; int level; @@ -1847,13 +1946,22 @@ again: console_seq = log_first_seq; console_idx = log_first_idx; } - +skip: if (console_seq == log_next_seq) break; msg = log_from_idx(console_idx); - level = msg->level & 7; + if (msg->flags & LOG_NOCONS) { + /* + * Skip record we have buffered and already printed + * directly to the console when we received it. + */ + console_idx = log_next(console_idx); + console_seq++; + goto skip; + } + level = msg->level; len = msg_print_text(msg, false, text, sizeof(text)); console_idx = log_next(console_idx); -- cgit v1.2.3 From d36208227d03c44c0a74cd702cc94528162e1703 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Jun 2012 11:40:11 -0400 Subject: printk: Optimize if statement logic where newline exists In reviewing Kay's fix up patch: "printk: Have printk() never buffer its data", I found two if statements that could be combined and optimized. Put together the two 'cont.len && cont.owner == current' if statements into a single one, and check if we need to call cont_add(). 
This also removes the unneeded double cont_flush() calls. Link: http://lkml.kernel.org/r/1340869133.876.10.camel@mop Signed-off-by: Steven Rostedt Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/printk.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index fbf4d0b22a1d..5ae6b09e3805 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1496,15 +1496,14 @@ asmlinkage int vprintk_emit(int facility, int level, bool stored = false; /* - * Flush the conflicting buffer. An earlier newline was missing, - * or we race with a continuation line from an interrupt. + * If an earlier newline was missing and it was the same task, + * either merge it with the current buffer and flush, or if + * there was a race with interrupts (prefix == true) then just + * flush it out and store this line separately. */ - if (cont.len && prefix && cont.owner == current) - cont_flush(); - - /* Merge with our buffer if possible; flush it in any case */ if (cont.len && cont.owner == current) { - stored = cont_add(facility, level, text, text_len); + if (!prefix) + stored = cont_add(facility, level, text, text_len); cont_flush(); } -- cgit v1.2.3 From 4f0f4af59cb07bcf44d3c07a9e8c26df54d9fff8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 30 Jun 2012 15:37:24 -0700 Subject: printk.c: fix kernel-doc warnings Fix kernel-doc warnings in printk.c: use correct parameter name. Warning(kernel/printk.c:2429): No description found for parameter 'buf' Warning(kernel/printk.c:2429): Excess function parameter 'line' description in 'kmsg_dump_get_buffer' Signed-off-by: Randy Dunlap Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 5ae6b09e3805..dba18211685e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -2538,7 +2538,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * kmsg_dump_get_buffer - copy kmsg log lines * @dumper: registered kmsg dumper * @syslog: include the "<4>" prefixes - * @line: buffer to copy the line to + * @buf: buffer to copy the line to * @size: maximum size of the buffer * @len: length of line placed into buffer * -- cgit v1.2.3
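Taken together, the kmsg_dump changes in this series leave dumpers with a simple pull-style iterator. A minimal sketch of a consumer against the interface documented above (my_dump, my_dumper and dump_buf are illustrative names, not code from these patches; in-tree dumpers such as pstore or mtdoops follow the same shape):

	#include <linux/kmsg_dump.h>

	static char dump_buf[4096];

	static void my_dump(struct kmsg_dumper *dumper,
			    enum kmsg_dump_reason reason)
	{
		size_t len;

		if (reason != KMSG_DUMP_PANIC && reason != KMSG_DUMP_OOPS)
			return;

		/* grab as many of the youngest records as fit into dump_buf */
		if (kmsg_dump_get_buffer(dumper, false, dump_buf,
					 sizeof(dump_buf), &len)) {
			/* hand (dump_buf, len) to the persistent store here */
		}

		/*
		 * kmsg_dump_rewind(dumper) would reset the iterator, e.g. to
		 * walk the same records again with kmsg_dump_get_line().
		 */
	}

	static struct kmsg_dumper my_dumper = {
		.dump = my_dump,
	};

	/* registered from module init code with kmsg_dump_register(&my_dumper) */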