From 8d9df9f0844ed87541453a3ef91bfc9f487053b7 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Mon, 16 Aug 2010 09:54:28 +0200 Subject: workqueue: free rescuer on destroy_workqueue wq->rescuer is not freed when wq is destroyed, leads a memory leak then. This patch also remove a redundant line. Signed-off-by: Xiaotian Feng Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2994a0e3a61c..1001b6e3fcbd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2782,7 +2782,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (IS_ERR(rescuer->task)) goto err; - wq->rescuer = rescuer; rescuer->task->flags |= PF_THREAD_BOUND; wake_up_process(rescuer->task); } @@ -2848,6 +2847,7 @@ void destroy_workqueue(struct workqueue_struct *wq) if (wq->flags & WQ_RESCUER) { kthread_stop(wq->rescuer->task); free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); } free_cwqs(wq); -- cgit v1.2.3 From 1495cc9df4e81f5a8fa9b0b8f1034b14d24b7d8c Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 17 Aug 2010 21:15:46 -0700 Subject: Input: sysrq - drop tty argument from sysrq ops handlers Noone is using tty argument so let's get rid of it. Acked-by: Alan Cox Acked-by: Jason Wessel Acked-by: Greg Kroah-Hartman Signed-off-by: Dmitry Torokhov --- arch/arm/kernel/etm.c | 2 +- arch/powerpc/xmon/xmon.c | 5 ++--- arch/sparc/kernel/process_64.c | 2 +- drivers/char/sysrq.c | 42 ++++++++++++++++++++--------------------- drivers/gpu/drm/drm_fb_helper.c | 2 +- drivers/net/ibm_newemac/debug.c | 2 +- include/linux/sysrq.h | 6 +++++- kernel/debug/debug_core.c | 2 +- kernel/power/poweroff.c | 2 +- 9 files changed, 34 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/etm.c b/arch/arm/kernel/etm.c index 56418f98cd01..33c7077174db 100644 --- a/arch/arm/kernel/etm.c +++ b/arch/arm/kernel/etm.c @@ -230,7 +230,7 @@ static void etm_dump(void) etb_lock(t); } -static void sysrq_etm_dump(int key, struct tty_struct *tty) +static void sysrq_etm_dump(int key) { dev_dbg(tracer.dev, "Dumping ETB buffer\n"); etm_dump(); diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 0554445200bf..d17d04cfb2cd 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2880,15 +2880,14 @@ static void xmon_init(int enable) } #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_xmon(int key, struct tty_struct *tty) +static void sysrq_handle_xmon(int key) { /* ensure xmon is enabled */ xmon_init(1); debugger(get_irq_regs()); } -static struct sysrq_key_op sysrq_xmon_op = -{ +static struct sysrq_key_op sysrq_xmon_op = { .handler = sysrq_handle_xmon, .help_msg = "Xmon", .action_msg = "Entering xmon", diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index dbe81a368b45..25b01b43b40d 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -303,7 +303,7 @@ void arch_trigger_all_cpu_backtrace(void) #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_globreg(int key, struct tty_struct *tty) +static void sysrq_handle_globreg(int key) { arch_trigger_all_cpu_backtrace(); } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 878ac0c2cc68..a892a3c249dd 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -76,7 +76,7 @@ static int __init sysrq_always_enabled_setup(char *str) __setup("sysrq_always_enabled", sysrq_always_enabled_setup); -static void sysrq_handle_loglevel(int key, struct tty_struct *tty) +static void sysrq_handle_loglevel(int key) { int i; @@ -93,7 +93,7 @@ static struct sysrq_key_op sysrq_loglevel_op = { }; #ifdef CONFIG_VT -static void sysrq_handle_SAK(int key, struct tty_struct *tty) +static void sysrq_handle_SAK(int key) { struct work_struct *SAK_work = &vc_cons[fg_console].SAK_work; schedule_work(SAK_work); @@ -109,7 +109,7 @@ static struct sysrq_key_op sysrq_SAK_op = { #endif #ifdef CONFIG_VT -static void sysrq_handle_unraw(int key, struct tty_struct *tty) +static void sysrq_handle_unraw(int key) { struct kbd_struct *kbd = &kbd_table[fg_console]; @@ -126,7 +126,7 @@ static struct sysrq_key_op sysrq_unraw_op = { #define sysrq_unraw_op (*(struct sysrq_key_op *)NULL) #endif /* CONFIG_VT */ -static void sysrq_handle_crash(int key, struct tty_struct *tty) +static void sysrq_handle_crash(int key) { char *killer = NULL; @@ -141,7 +141,7 @@ static struct sysrq_key_op sysrq_crash_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; -static void sysrq_handle_reboot(int key, struct tty_struct *tty) +static void sysrq_handle_reboot(int key) { lockdep_off(); local_irq_enable(); @@ -154,7 +154,7 @@ static struct sysrq_key_op sysrq_reboot_op = { .enable_mask = SYSRQ_ENABLE_BOOT, }; -static void sysrq_handle_sync(int key, struct tty_struct *tty) +static void sysrq_handle_sync(int key) { emergency_sync(); } @@ -165,7 +165,7 @@ static struct sysrq_key_op sysrq_sync_op = { .enable_mask = SYSRQ_ENABLE_SYNC, }; -static void sysrq_handle_show_timers(int key, struct tty_struct *tty) +static void sysrq_handle_show_timers(int key) { sysrq_timer_list_show(); } @@ -176,7 +176,7 @@ static struct sysrq_key_op sysrq_show_timers_op = { .action_msg = "Show clockevent devices & pending hrtimers (no others)", }; -static void sysrq_handle_mountro(int key, struct tty_struct *tty) +static void sysrq_handle_mountro(int key) { emergency_remount(); } @@ -188,7 +188,7 @@ static struct sysrq_key_op sysrq_mountro_op = { }; #ifdef CONFIG_LOCKDEP -static void sysrq_handle_showlocks(int key, struct tty_struct *tty) +static void sysrq_handle_showlocks(int key) { debug_show_all_locks(); } @@ -226,7 +226,7 @@ static void sysrq_showregs_othercpus(struct work_struct *dummy) static DECLARE_WORK(sysrq_showallcpus, sysrq_showregs_othercpus); -static void sysrq_handle_showallcpus(int key, struct tty_struct *tty) +static void sysrq_handle_showallcpus(int key) { /* * Fall back to the workqueue based printing if the @@ -252,7 +252,7 @@ static struct sysrq_key_op sysrq_showallcpus_op = { }; #endif -static void sysrq_handle_showregs(int key, struct tty_struct *tty) +static void sysrq_handle_showregs(int key) { struct pt_regs *regs = get_irq_regs(); if (regs) @@ -266,7 +266,7 @@ static struct sysrq_key_op sysrq_showregs_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; -static void sysrq_handle_showstate(int key, struct tty_struct *tty) +static void sysrq_handle_showstate(int key) { show_state(); } @@ -277,7 +277,7 @@ static struct sysrq_key_op sysrq_showstate_op = { .enable_mask = SYSRQ_ENABLE_DUMP, }; -static void sysrq_handle_showstate_blocked(int key, struct tty_struct *tty) +static void sysrq_handle_showstate_blocked(int key) { show_state_filter(TASK_UNINTERRUPTIBLE); } @@ -291,7 +291,7 @@ static struct sysrq_key_op sysrq_showstate_blocked_op = { #ifdef CONFIG_TRACING #include -static void sysrq_ftrace_dump(int key, struct tty_struct *tty) +static void sysrq_ftrace_dump(int key) { ftrace_dump(DUMP_ALL); } @@ -305,7 +305,7 @@ static struct sysrq_key_op sysrq_ftrace_dump_op = { #define sysrq_ftrace_dump_op (*(struct sysrq_key_op *)NULL) #endif -static void sysrq_handle_showmem(int key, struct tty_struct *tty) +static void sysrq_handle_showmem(int key) { show_mem(); } @@ -330,7 +330,7 @@ static void send_sig_all(int sig) } } -static void sysrq_handle_term(int key, struct tty_struct *tty) +static void sysrq_handle_term(int key) { send_sig_all(SIGTERM); console_loglevel = 8; @@ -349,7 +349,7 @@ static void moom_callback(struct work_struct *ignored) static DECLARE_WORK(moom_work, moom_callback); -static void sysrq_handle_moom(int key, struct tty_struct *tty) +static void sysrq_handle_moom(int key) { schedule_work(&moom_work); } @@ -361,7 +361,7 @@ static struct sysrq_key_op sysrq_moom_op = { }; #ifdef CONFIG_BLOCK -static void sysrq_handle_thaw(int key, struct tty_struct *tty) +static void sysrq_handle_thaw(int key) { emergency_thaw_all(); } @@ -373,7 +373,7 @@ static struct sysrq_key_op sysrq_thaw_op = { }; #endif -static void sysrq_handle_kill(int key, struct tty_struct *tty) +static void sysrq_handle_kill(int key) { send_sig_all(SIGKILL); console_loglevel = 8; @@ -385,7 +385,7 @@ static struct sysrq_key_op sysrq_kill_op = { .enable_mask = SYSRQ_ENABLE_SIGNAL, }; -static void sysrq_handle_unrt(int key, struct tty_struct *tty) +static void sysrq_handle_unrt(int key) { normalize_rt_tasks(); } @@ -520,7 +520,7 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) if (!check_mask || sysrq_on_mask(op_p->enable_mask)) { printk("%s\n", op_p->action_msg); console_loglevel = orig_log_level; - op_p->handler(key, tty); + op_p->handler(key); } else { printk("This sysrq operation is disabled.\n"); } diff --git a/drivers/gpu/drm/drm_fb_helper.c b/drivers/gpu/drm/drm_fb_helper.c index de82e201d682..5efd6d6742ec 100644 --- a/drivers/gpu/drm/drm_fb_helper.c +++ b/drivers/gpu/drm/drm_fb_helper.c @@ -369,7 +369,7 @@ static void drm_fb_helper_restore_work_fn(struct work_struct *ignored) } static DECLARE_WORK(drm_fb_helper_restore_work, drm_fb_helper_restore_work_fn); -static void drm_fb_helper_sysrq(int dummy1, struct tty_struct *dummy3) +static void drm_fb_helper_sysrq(int dummy1) { schedule_work(&drm_fb_helper_restore_work); } diff --git a/drivers/net/ibm_newemac/debug.c b/drivers/net/ibm_newemac/debug.c index 3995fafc1e08..8c6c1e2a8750 100644 --- a/drivers/net/ibm_newemac/debug.c +++ b/drivers/net/ibm_newemac/debug.c @@ -238,7 +238,7 @@ void emac_dbg_dump_all(void) } #if defined(CONFIG_MAGIC_SYSRQ) -static void emac_sysrq_handler(int key, struct tty_struct *tty) +static void emac_sysrq_handler(int key) { emac_dbg_dump_all(); } diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 609e8ca5f534..4ee650315119 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -31,7 +31,7 @@ struct tty_struct; #define SYSRQ_ENABLE_RTNICE 0x0100 struct sysrq_key_op { - void (*handler)(int, struct tty_struct *); + void (*handler)(int); char *help_msg; char *action_msg; int enable_mask; @@ -58,6 +58,10 @@ static inline void handle_sysrq(int key, struct tty_struct *tty) { } +static inline void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); +{ +} + static inline int register_sysrq_key(int key, struct sysrq_key_op *op) { return -EINVAL; diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 3c2d4972d235..de407c78178d 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -741,7 +741,7 @@ static struct console kgdbcons = { }; #ifdef CONFIG_MAGIC_SYSRQ -static void sysrq_handle_dbg(int key, struct tty_struct *tty) +static void sysrq_handle_dbg(int key) { if (!dbg_io_ops) { printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index e8b337006276..d52359374e85 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -24,7 +24,7 @@ static void do_poweroff(struct work_struct *dummy) static DECLARE_WORK(poweroff_work, do_poweroff); -static void handle_poweroff(int key, struct tty_struct *tty) +static void handle_poweroff(int key) { /* run sysrq poweroff on boot cpu */ schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); -- cgit v1.2.3 From 861d034ee814917a83bd5de4b26e3b8336ddeeb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Aug 2010 13:31:43 +0200 Subject: sched: Fix rq->clock synchronization when migrating tasks sched_fork() -- we do task placement in ->task_fork_fair() ensure we update_rq_clock() so we work with current time. We leave the vruntime in relative state, so the time delay until wake_up_new_task() doesn't matter. wake_up_new_task() -- Since task_fork_fair() left p->vruntime in relative state we can safely migrate, the activate_task() on the remote rq will call update_rq_clock() and causes the clock to be synced (enough). Tested-by: Jack Daniel Tested-by: Philby John Signed-off-by: Peter Zijlstra LKML-Reference: <1281002322.1923.1708.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 806d1b227a21..ab661ebc4895 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3752,6 +3752,8 @@ static void task_fork_fair(struct task_struct *p) raw_spin_lock_irqsave(&rq->lock, flags); + update_rq_clock(rq); + if (unlikely(task_cpu(p) != this_cpu)) __set_task_cpu(p, this_cpu); -- cgit v1.2.3 From f335397d177c906256ee1bba28e8c49e8ec63817 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Tue, 17 Aug 2010 21:15:47 -0700 Subject: Input: sysrq - drop tty argument form handle_sysrq() Sysrq operations do not accept tty argument anymore so no need to pass it to us. [Stephen Rothwell : fix build breakage in drm code caused by sysrq using bool but not including linux/types.h] [Sachin Sant : fix build breakage in s390 keyboadr driver] Acked-by: Alan Cox Acked-by: Jason Wessel Acked-by: Greg Kroah-Hartman Signed-off-by: Dmitry Torokhov --- arch/ia64/hp/sim/simserial.c | 2 +- arch/um/drivers/mconsole_kern.c | 2 +- drivers/char/hangcheck-timer.c | 2 +- drivers/char/hvc_console.c | 2 +- drivers/char/hvsi.c | 2 +- drivers/char/sysrq.c | 11 +++++------ drivers/s390/char/ctrlchar.c | 4 +--- drivers/s390/char/keyboard.c | 2 +- drivers/serial/sn_console.c | 2 +- drivers/usb/serial/generic.c | 2 +- drivers/xen/manage.c | 2 +- include/linux/serial_core.h | 2 +- include/linux/sysrq.h | 12 +++++------- kernel/debug/kdb/kdb_main.c | 2 +- 14 files changed, 22 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index 2bef5261d96d..1e8d71ad93ef 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -149,7 +149,7 @@ static void receive_chars(struct tty_struct *tty) ch = ia64_ssc(0, 0, 0, 0, SSC_GETCHAR); while (!ch); - handle_sysrq(ch, NULL); + handle_sysrq(ch); } #endif seen_esc = 0; diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index de317d0c3294..ebc680717e59 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -690,7 +690,7 @@ static void with_console(struct mc_request *req, void (*proc)(void *), static void sysrq_proc(void *arg) { char *op = arg; - handle_sysrq(*op, NULL); + handle_sysrq(*op); } void mconsole_sysrq(struct mc_request *req) diff --git a/drivers/char/hangcheck-timer.c b/drivers/char/hangcheck-timer.c index e0249722d25f..f953c96efc86 100644 --- a/drivers/char/hangcheck-timer.c +++ b/drivers/char/hangcheck-timer.c @@ -159,7 +159,7 @@ static void hangcheck_fire(unsigned long data) if (hangcheck_dump_tasks) { printk(KERN_CRIT "Hangcheck: Task state:\n"); #ifdef CONFIG_MAGIC_SYSRQ - handle_sysrq('t', NULL); + handle_sysrq('t'); #endif /* CONFIG_MAGIC_SYSRQ */ } if (hangcheck_reboot) { diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c index fa27d1676ee5..3afd62e856eb 100644 --- a/drivers/char/hvc_console.c +++ b/drivers/char/hvc_console.c @@ -651,7 +651,7 @@ int hvc_poll(struct hvc_struct *hp) if (sysrq_pressed) continue; } else if (sysrq_pressed) { - handle_sysrq(buf[i], tty); + handle_sysrq(buf[i]); sysrq_pressed = 0; continue; } diff --git a/drivers/char/hvsi.c b/drivers/char/hvsi.c index 1f4b6de65a2d..a2bc885ce60a 100644 --- a/drivers/char/hvsi.c +++ b/drivers/char/hvsi.c @@ -403,7 +403,7 @@ static void hvsi_insert_chars(struct hvsi_struct *hp, const char *buf, int len) hp->sysrq = 1; continue; } else if (hp->sysrq) { - handle_sysrq(c, hp->tty); + handle_sysrq(c); hp->sysrq = 0; continue; } diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index a892a3c249dd..ef31bb81e843 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -493,7 +492,7 @@ static void __sysrq_put_key_op(int key, struct sysrq_key_op *op_p) sysrq_key_table[i] = op_p; } -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) +void __handle_sysrq(int key, bool check_mask) { struct sysrq_key_op *op_p; int orig_log_level; @@ -545,10 +544,10 @@ void __handle_sysrq(int key, struct tty_struct *tty, int check_mask) spin_unlock_irqrestore(&sysrq_key_table_lock, flags); } -void handle_sysrq(int key, struct tty_struct *tty) +void handle_sysrq(int key) { if (sysrq_on()) - __handle_sysrq(key, tty, 1); + __handle_sysrq(key, true); } EXPORT_SYMBOL(handle_sysrq); @@ -597,7 +596,7 @@ static bool sysrq_filter(struct input_handle *handle, unsigned int type, default: if (sysrq_down && value && value != 2) - __handle_sysrq(sysrq_xlate[code], NULL, 1); + __handle_sysrq(sysrq_xlate[code], true); break; } @@ -765,7 +764,7 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, if (get_user(c, buf)) return -EFAULT; - __handle_sysrq(c, NULL, 0); + __handle_sysrq(c, false); } return count; diff --git a/drivers/s390/char/ctrlchar.c b/drivers/s390/char/ctrlchar.c index c6cbcb3f925e..0e9a309b9669 100644 --- a/drivers/s390/char/ctrlchar.c +++ b/drivers/s390/char/ctrlchar.c @@ -16,12 +16,11 @@ #ifdef CONFIG_MAGIC_SYSRQ static int ctrlchar_sysrq_key; -static struct tty_struct *sysrq_tty; static void ctrlchar_handle_sysrq(struct work_struct *work) { - handle_sysrq(ctrlchar_sysrq_key, sysrq_tty); + handle_sysrq(ctrlchar_sysrq_key); } static DECLARE_WORK(ctrlchar_work, ctrlchar_handle_sysrq); @@ -54,7 +53,6 @@ ctrlchar_handle(const unsigned char *buf, int len, struct tty_struct *tty) /* racy */ if (len == 3 && buf[1] == '-') { ctrlchar_sysrq_key = buf[2]; - sysrq_tty = tty; schedule_work(&ctrlchar_work); return CTRLCHAR_SYSRQ; } diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c index 18d9a497863b..8cd58e412b5e 100644 --- a/drivers/s390/char/keyboard.c +++ b/drivers/s390/char/keyboard.c @@ -305,7 +305,7 @@ kbd_keycode(struct kbd_data *kbd, unsigned int keycode) if (kbd->sysrq) { if (kbd->sysrq == K(KT_LATIN, '-')) { kbd->sysrq = 0; - handle_sysrq(value, kbd->tty); + handle_sysrq(value); return; } if (value == '-') { diff --git a/drivers/serial/sn_console.c b/drivers/serial/sn_console.c index 7e5e5efea4e2..cff9a306660f 100644 --- a/drivers/serial/sn_console.c +++ b/drivers/serial/sn_console.c @@ -492,7 +492,7 @@ sn_receive_chars(struct sn_cons_port *port, unsigned long flags) sysrq_requested = 0; if (ch && time_before(jiffies, sysrq_timeout)) { spin_unlock_irqrestore(&port->sc_port.lock, flags); - handle_sysrq(ch, NULL); + handle_sysrq(ch); spin_lock_irqsave(&port->sc_port.lock, flags); /* ignore actual sysrq command char */ continue; diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index ca92f67747cc..1e846cc3c7a4 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -453,7 +453,7 @@ int usb_serial_handle_sysrq_char(struct tty_struct *tty, { if (port->sysrq && port->port.console) { if (ch && time_before(jiffies, port->sysrq)) { - handle_sysrq(ch, tty); + handle_sysrq(ch); port->sysrq = 0; return 1; } diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 1799bd890315..ef9c7db52077 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -237,7 +237,7 @@ static void sysrq_handler(struct xenbus_watch *watch, const char **vec, goto again; if (sysrq_key != '\0') - handle_sysrq(sysrq_key, NULL); + handle_sysrq(sysrq_key); } static struct xenbus_watch sysrq_watch = { diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 3c2ad99fed34..64458a9a8938 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -465,7 +465,7 @@ uart_handle_sysrq_char(struct uart_port *port, unsigned int ch) #ifdef SUPPORT_SYSRQ if (port->sysrq) { if (ch && time_before(jiffies, port->sysrq)) { - handle_sysrq(ch, port->state->port.tty); + handle_sysrq(ch); port->sysrq = 0; return 1; } diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 4ee650315119..387fa7d05c98 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -15,9 +15,7 @@ #define _LINUX_SYSRQ_H #include - -struct pt_regs; -struct tty_struct; +#include /* Possible values of bitmask for enabling sysrq functions */ /* 0x0001 is reserved for enable everything */ @@ -44,8 +42,8 @@ struct sysrq_key_op { * are available -- else NULL's). */ -void handle_sysrq(int key, struct tty_struct *tty); -void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); +void handle_sysrq(int key); +void __handle_sysrq(int key, bool check_mask); int register_sysrq_key(int key, struct sysrq_key_op *op); int unregister_sysrq_key(int key, struct sysrq_key_op *op); struct sysrq_key_op *__sysrq_get_key_op(int key); @@ -54,11 +52,11 @@ int sysrq_toggle_support(int enable_mask); #else -static inline void handle_sysrq(int key, struct tty_struct *tty) +static inline void handle_sysrq(int key) { } -static inline void __handle_sysrq(int key, struct tty_struct *tty, int check_mask); +static inline void __handle_sysrq(int key, bool check_mask) { } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 28b844118bbd..caf057a3de0e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1929,7 +1929,7 @@ static int kdb_sr(int argc, const char **argv) if (argc != 1) return KDB_ARGCOUNT; kdb_trap_printk++; - __handle_sysrq(*argv[1], NULL, 0); + __handle_sysrq(*argv[1], false); kdb_trap_printk--; return 0; -- cgit v1.2.3 From 06bd6ebffae36d3b105677598c48e8bd0a10b205 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 22 Aug 2010 23:19:42 +0900 Subject: workqueue: annotate lock context change Some of internal functions called within gcwq->lock context releases and regrabs the lock but were missing proper annotations. Add it. Signed-off-by: Namhyung Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1001b6e3fcbd..7415f27a8aa7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1485,6 +1485,8 @@ static void gcwq_mayday_timeout(unsigned long __gcwq) * otherwise. */ static bool maybe_create_worker(struct global_cwq *gcwq) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { if (!need_to_create_worker(gcwq)) return false; @@ -1722,6 +1724,8 @@ static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) * spin_lock_irq(gcwq->lock) which is released and regrabbed. */ static void process_one_work(struct worker *worker, struct work_struct *work) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { struct cpu_workqueue_struct *cwq = get_work_cwq(work); struct global_cwq *gcwq = cwq->gcwq; @@ -3230,6 +3234,8 @@ static int __cpuinit trustee_thread(void *__gcwq) * multiple times. To be used by cpu_callback. */ static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) { if (!(gcwq->trustee_state == state || gcwq->trustee_state == TRUSTEE_DONE)) { -- cgit v1.2.3 From 972fa1c5316d18c8297123e08e9b6930ca34f888 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sun, 22 Aug 2010 23:19:43 +0900 Subject: workqueue: mark lock acquisition on worker_maybe_bind_and_lock() worker_maybe_bind_and_lock() actually grabs gcwq->lock but was missing proper annotation. Add it. So this patch will remove following sparse warnings: kernel/workqueue.c:1214:13: warning: context imbalance in 'worker_maybe_bind_and_lock' - wrong count at exit arch/x86/include/asm/irqflags.h:44:9: warning: context imbalance in 'worker_rebind_fn' - unexpected unlock kernel/workqueue.c:1991:17: warning: context imbalance in 'rescuer_thread' - unexpected unlock Signed-off-by: Namhyung Kim Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7415f27a8aa7..cc3456f96c56 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1212,6 +1212,7 @@ static void worker_leave_idle(struct worker *worker) * bound), %false if offline. */ static bool worker_maybe_bind_and_lock(struct worker *worker) +__acquires(&gcwq->lock) { struct global_cwq *gcwq = worker->gcwq; struct task_struct *task = worker->task; -- cgit v1.2.3 From e41e704bc4f49057fc68b643108366e6e6781aa3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Aug 2010 14:22:47 +0200 Subject: workqueue: improve destroy_workqueue() debuggability Now that the worklist is global, having works pending after wq destruction can easily lead to oops and destroy_workqueue() have several BUG_ON()s to catch these cases. Unfortunately, BUG_ON() doesn't tell much about how the work became pending after the final flush_workqueue(). This patch adds WQ_DYING which is set before the final flush begins. If a work is requested to be queued on a dying workqueue, WARN_ON_ONCE() is triggered and the request is ignored. This clearly indicates which caller is trying to queue a work on a dying workqueue and keeps the system working in most cases. Locking rule comment is updated such that the 'I' rule includes modifying the field from destruction path. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 4f9d277bcd9a..c959666eafca 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -241,6 +241,8 @@ enum { WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_DYING = 1 << 6, /* internal: workqueue is dying */ + WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cc3456f96c56..362b50d092e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -87,7 +87,8 @@ enum { /* * Structure fields follow one of the following exclusion rules. * - * I: Set during initialization and read-only afterwards. + * I: Modifiable by initialization/destruction paths and read-only for + * everyone else. * * P: Preemption protected. Disabling preemption is enough and should * only be modified and accessed from the local cpu. @@ -944,6 +945,9 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); + if (WARN_ON_ONCE(wq->flags & WQ_DYING)) + return; + /* determine gcwq to use */ if (!(wq->flags & WQ_UNBOUND)) { struct global_cwq *last_gcwq; @@ -2828,6 +2832,7 @@ void destroy_workqueue(struct workqueue_struct *wq) { unsigned int cpu; + wq->flags |= WQ_DYING; flush_workqueue(wq); /* -- cgit v1.2.3 From bac1e74dba9755128748b872a0f304dad4d198c6 Mon Sep 17 00:00:00 2001 From: David Alan Gilbert Date: Tue, 24 Aug 2010 20:22:18 +0200 Subject: PM QoS: Fix kzalloc() parameters swapped in pm_qos_power_open() sparse spotted that the kzalloc() in pm_qos_power_open() in the current Linus' git tree had its parameters swapped. Fix this. Signed-off-by: David Alan Gilbert Acked-by: mark gross Signed-off-by: Rafael J. Wysocki --- kernel/pm_qos_params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 996a4dec5f96..9da439b419aa 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -348,7 +348,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); + struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; -- cgit v1.2.3 From 8a2e8e5dec7e29c56a46ba176c664ab6a3d04118 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 25 Aug 2010 10:33:56 +0200 Subject: workqueue: fix cwq->nr_active underflow cwq->nr_active is used to keep track of how many work items are active for the cpu workqueue, where 'active' is defined as either pending on global worklist or executing. This is used to implement the max_active limit and workqueue freezing. If a work item is queued after nr_active has already reached max_active, the work item doesn't increment nr_active and is put on the delayed queue and gets activated later as previous active work items retire. try_to_grab_pending() which is used in the cancellation path unconditionally decremented nr_active whether the work item being cancelled is currently active or delayed, so cancelling a delayed work item makes nr_active underflow. This breaks max_active enforcement and triggers BUG_ON() in destroy_workqueue() later on. This patch fixes this bug by adding a flag WORK_STRUCT_DELAYED, which is set while a work item in on the delayed list and making try_to_grab_pending() decrement nr_active iff the work item is currently active. The addition of the flag enlarges cwq alignment to 256 bytes which is getting a bit too large. It's scheduled to be reduced back to 128 bytes by merging WORK_STRUCT_PENDING and WORK_STRUCT_CWQ in the next devel cycle. Signed-off-by: Tejun Heo Reported-by: Johannes Berg --- include/linux/workqueue.h | 16 +++++++++------- kernel/workqueue.c | 30 ++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c959666eafca..f11100f96482 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -25,18 +25,20 @@ typedef void (*work_func_t)(struct work_struct *work); enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ - WORK_STRUCT_CWQ_BIT = 1, /* data points to cwq */ - WORK_STRUCT_LINKED_BIT = 2, /* next work is linked to this one */ + WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */ + WORK_STRUCT_CWQ_BIT = 2, /* data points to cwq */ + WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK - WORK_STRUCT_STATIC_BIT = 3, /* static initializer (debugobjects) */ - WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ + WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ + WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ #else - WORK_STRUCT_COLOR_SHIFT = 3, /* color for workqueue flushing */ + WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ #endif WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, + WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT, WORK_STRUCT_CWQ = 1 << WORK_STRUCT_CWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -59,8 +61,8 @@ enum { /* * Reserve 7 bits off of cwq pointer w/ debugobjects turned - * off. This makes cwqs aligned to 128 bytes which isn't too - * excessive while allowing 15 workqueue flush colors. + * off. This makes cwqs aligned to 256 bytes and allows 15 + * workqueue flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 362b50d092e2..a2dccfca03ba 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -941,6 +941,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, struct global_cwq *gcwq; struct cpu_workqueue_struct *cwq; struct list_head *worklist; + unsigned int work_flags; unsigned long flags; debug_work_activate(work); @@ -990,14 +991,17 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, BUG_ON(!list_empty(&work->entry)); cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); if (likely(cwq->nr_active < cwq->max_active)) { cwq->nr_active++; worklist = gcwq_determine_ins_pos(gcwq, cwq); - } else + } else { + work_flags |= WORK_STRUCT_DELAYED; worklist = &cwq->delayed_works; + } - insert_work(cwq, work, worklist, work_color_to_flags(cwq->work_color)); + insert_work(cwq, work, worklist, work_flags); spin_unlock_irqrestore(&gcwq->lock, flags); } @@ -1666,6 +1670,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); move_linked_works(work, pos, NULL); + __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; } @@ -1673,6 +1678,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight * @cwq: cwq of interest * @color: color of work which left the queue + * @delayed: for a delayed work * * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its cwq and handle workqueue flushing. @@ -1680,19 +1686,22 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) * CONTEXT: * spin_lock_irq(gcwq->lock). */ -static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) +static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, + bool delayed) { /* ignore uncolored works */ if (color == WORK_NO_COLOR) return; cwq->nr_in_flight[color]--; - cwq->nr_active--; - if (!list_empty(&cwq->delayed_works)) { - /* one down, submit a delayed one */ - if (cwq->nr_active < cwq->max_active) - cwq_activate_first_delayed(cwq); + if (!delayed) { + cwq->nr_active--; + if (!list_empty(&cwq->delayed_works)) { + /* one down, submit a delayed one */ + if (cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } } /* is flush in progress and are we at the flushing tip? */ @@ -1823,7 +1832,7 @@ __acquires(&gcwq->lock) hlist_del_init(&worker->hentry); worker->current_work = NULL; worker->current_cwq = NULL; - cwq_dec_nr_in_flight(cwq, work_color); + cwq_dec_nr_in_flight(cwq, work_color, false); } /** @@ -2388,7 +2397,8 @@ static int try_to_grab_pending(struct work_struct *work) debug_work_deactivate(work); list_del_init(&work->entry); cwq_dec_nr_in_flight(get_work_cwq(work), - get_work_color(work)); + get_work_color(work), + *work_data_bits(work) & WORK_STRUCT_DELAYED); ret = 1; } } -- cgit v1.2.3 From 25cc69ec34a563e943e85b3b68a79a8aac7f076d Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Thu, 26 Aug 2010 20:18:43 +0200 Subject: PM QoS: Fix inline documentation. Fix the pm_qos_add_request() kerneldoc comment that doesn't reflect the behavior of the function after the last PM QoS update. Signed-off-by: Saravana Kannan Acked-by: mark gross Signed-off-by: Rafael J. Wysocki --- kernel/pm_qos_params.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 9da439b419aa..b7e4c362361b 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -212,15 +212,17 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); /** * pm_qos_add_request - inserts new qos request into the list - * @pm_qos_class: identifies which list of qos request to us + * @dep: pointer to a preallocated handle + * @pm_qos_class: identifies which list of qos request to use * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters, and returns the pm_qos_request list - * element as a handle for use in updating and removal. Call needs to save - * this handle for later use. + * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * handle. Caller needs to save this handle for later use in updates and + * removal. */ + void pm_qos_add_request(struct pm_qos_request_list *dep, int pm_qos_class, s32 value) { -- cgit v1.2.3 From 477a3c33d1efa0342a74bd02da2e049191993e2c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 31 Aug 2010 10:54:35 +0200 Subject: workqueue: fix GCWQ_DISASSOCIATED initialization init_workqueues() incorrectly marks workqueues for all possible CPUs associated. Combined with mayday_mask initialization bug, this can make rescuers keep trying to bind to an offline gcwq indefinitely. Fix init_workqueues() such that only online CPUs have their gcwqs have GCWQ_DISASSOCIATED cleared. Signed-off-by: Tejun Heo Reported-by: CAI Qian --- kernel/workqueue.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a2dccfca03ba..c8183b235d16 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3558,8 +3558,7 @@ static int __init init_workqueues(void) spin_lock_init(&gcwq->lock); INIT_LIST_HEAD(&gcwq->worklist); gcwq->cpu = cpu; - if (cpu == WORK_CPU_UNBOUND) - gcwq->flags |= GCWQ_DISASSOCIATED; + gcwq->flags |= GCWQ_DISASSOCIATED; INIT_LIST_HEAD(&gcwq->idle_list); for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) @@ -3583,6 +3582,8 @@ static int __init init_workqueues(void) struct global_cwq *gcwq = get_gcwq(cpu); struct worker *worker; + if (cpu != WORK_CPU_UNBOUND) + gcwq->flags &= ~GCWQ_DISASSOCIATED; worker = create_worker(gcwq, true); BUG_ON(!worker); spin_lock_irq(&gcwq->lock); -- cgit v1.2.3 From 9c37547ab62f88aac3e1e3c2065b611f811de9b5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 31 Aug 2010 11:18:34 +0200 Subject: workqueue: use zalloc_cpumask_var() for gcwq->mayday_mask alloc_mayday_mask() was using alloc_cpumask_var() making gcwq->mayday_mask contain garbage after initialization on CONFIG_CPUMASK_OFFSTACK=y configurations. This combined with the previously fixed GCWQ_DISASSOCIATED initialization bug could make rescuers fall into infinite loop trying to bind to an offline cpu. Signed-off-by: Tejun Heo Reported-by: CAI Qian --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c8183b235d16..785542976b00 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -196,7 +196,7 @@ typedef cpumask_var_t mayday_mask_t; cpumask_test_and_set_cpu((cpu), (mask)) #define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) #define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp) alloc_cpumask_var((maskp), (gfp)) +#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) #define free_mayday_mask(mask) free_cpumask_var((mask)) #else typedef unsigned long mayday_mask_t; -- cgit v1.2.3 From 950eaaca681c44aab87a46225c9e44f902c080aa Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Aug 2010 17:00:18 -0700 Subject: pid: make setpgid() system call use RCU read-side critical section [ 23.584719] [ 23.584720] =================================================== [ 23.585059] [ INFO: suspicious rcu_dereference_check() usage. ] [ 23.585176] --------------------------------------------------- [ 23.585176] kernel/pid.c:419 invoked rcu_dereference_check() without protection! [ 23.585176] [ 23.585176] other info that might help us debug this: [ 23.585176] [ 23.585176] [ 23.585176] rcu_scheduler_active = 1, debug_locks = 1 [ 23.585176] 1 lock held by rc.sysinit/728: [ 23.585176] #0: (tasklist_lock){.+.+..}, at: [] sys_setpgid+0x5f/0x193 [ 23.585176] [ 23.585176] stack backtrace: [ 23.585176] Pid: 728, comm: rc.sysinit Not tainted 2.6.36-rc2 #2 [ 23.585176] Call Trace: [ 23.585176] [] lockdep_rcu_dereference+0x99/0xa2 [ 23.585176] [] find_task_by_pid_ns+0x50/0x6a [ 23.585176] [] find_task_by_vpid+0x1d/0x1f [ 23.585176] [] sys_setpgid+0x67/0x193 [ 23.585176] [] system_call_fastpath+0x16/0x1b [ 24.959669] type=1400 audit(1282938522.956:4): avc: denied { module_request } for pid=766 comm="hwclock" kmod="char-major-10-135" scontext=system_u:system_r:hwclock_t:s0 tcontext=system_u:system_r:kernel_t:s0 tclas It turns out that the setpgid() system call fails to enter an RCU read-side critical section before doing a PID-to-task_struct translation. This commit therefore does rcu_read_lock() before the translation, and also does rcu_read_unlock() after the last use of the returned pointer. Reported-by: Andrew Morton Signed-off-by: Paul E. McKenney Acked-by: David Howells --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e9ad44489828..7f5a0cd296a9 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) pgid = pid; if (pgid < 0) return -EINVAL; + rcu_read_lock(); /* From this point forward we keep holding onto the tasklist lock * so that our parent does not change from under us. -DaveM @@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) out: /* All paths lead to here, thus we are safe. -DaveM */ write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); return err; } -- cgit v1.2.3 From ef5dc121d5a0bb1fa477c5395277259f07d318a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 2 Sep 2010 15:48:16 -0700 Subject: mutex: Fix annotations to include it in kernel-locking docbook Fix kernel-doc notation in linux/mutex.h and kernel/mutex.c, then add these 2 files to the kernel-locking docbook as the Mutex API reference chapter. Add one API function to mutex-design.txt and correct a typo in that file. Signed-off-by: Randy Dunlap Cc: Rusty Russell LKML-Reference: <20100902154816.6cc2f9ad.randy.dunlap@oracle.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/kernel-locking.tmpl | 6 ++++++ Documentation/mutex-design.txt | 3 ++- include/linux/mutex.h | 8 ++++++++ kernel/mutex.c | 23 +++++++---------------- 4 files changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl index 0b1a3f97f285..a0d479d1e1dd 100644 --- a/Documentation/DocBook/kernel-locking.tmpl +++ b/Documentation/DocBook/kernel-locking.tmpl @@ -1961,6 +1961,12 @@ machines due to caching. + + Mutex API reference +!Iinclude/linux/mutex.h +!Ekernel/mutex.c + + Further reading diff --git a/Documentation/mutex-design.txt b/Documentation/mutex-design.txt index c91ccc0720fa..38c10fd7f411 100644 --- a/Documentation/mutex-design.txt +++ b/Documentation/mutex-design.txt @@ -9,7 +9,7 @@ firstly, there's nothing wrong with semaphores. But if the simpler mutex semantics are sufficient for your code, then there are a couple of advantages of mutexes: - - 'struct mutex' is smaller on most architectures: .e.g on x86, + - 'struct mutex' is smaller on most architectures: E.g. on x86, 'struct semaphore' is 20 bytes, 'struct mutex' is 16 bytes. A smaller structure size means less RAM footprint, and better CPU-cache utilization. @@ -136,3 +136,4 @@ the APIs of 'struct mutex' have been streamlined: void mutex_lock_nested(struct mutex *lock, unsigned int subclass); int mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); + int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 878cab4f5fcc..f363bc8fdc74 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -78,6 +78,14 @@ struct mutex_waiter { # include #else # define __DEBUG_MUTEX_INITIALIZER(lockname) +/** + * mutex_init - initialize the mutex + * @mutex: the mutex to be initialized + * + * Initialize the mutex to unlocked state. + * + * It is not allowed to initialize an already locked mutex. + */ # define mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 4c0b7b3e6d2e..200407c1502f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -36,15 +36,6 @@ # include #endif -/*** - * mutex_init - initialize the mutex - * @lock: the mutex to be initialized - * @key: the lock_class_key for the class; used by mutex lock debugging - * - * Initialize the mutex to unlocked state. - * - * It is not allowed to initialize an already locked mutex. - */ void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -68,7 +59,7 @@ EXPORT_SYMBOL(__mutex_init); static __used noinline void __sched __mutex_lock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_lock - acquire the mutex * @lock: the mutex to be acquired * @@ -105,7 +96,7 @@ EXPORT_SYMBOL(mutex_lock); static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); -/*** +/** * mutex_unlock - release the mutex * @lock: the mutex to be released * @@ -364,8 +355,8 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count); static noinline int __sched __mutex_lock_interruptible_slowpath(atomic_t *lock_count); -/*** - * mutex_lock_interruptible - acquire the mutex, interruptable +/** + * mutex_lock_interruptible - acquire the mutex, interruptible * @lock: the mutex to be acquired * * Lock the mutex like mutex_lock(), and return 0 if the mutex has @@ -456,15 +447,15 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) return prev == 1; } -/*** - * mutex_trylock - try acquire the mutex, without waiting +/** + * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired * * Try to acquire the mutex atomically. Returns 1 if the mutex * has been acquired successfully, and 0 on contention. * * NOTE: this function follows the spin_trylock() convention, so - * it is negated to the down_trylock() return values! Be careful + * it is negated from the down_trylock() return values! Be careful * about this when converting semaphore users to mutexes. * * This function must not be used in interrupt context. The -- cgit v1.2.3 From b3bd3de66f60df4c9a2076e2886a622458929056 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 10 Aug 2010 14:17:51 -0700 Subject: gcc-4.6: kernel/*: Fix unused but set warnings No real bugs I believe, just some dead code. Signed-off-by: Andi Kleen Cc: Peter Zijlstra Cc: andi@firstfloor.org Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/debug/kdb/kdb_bp.c | 2 -- kernel/hrtimer.c | 3 +-- kernel/sched_fair.c | 3 +-- kernel/sysctl.c | 5 +---- kernel/trace/ring_buffer.c | 2 -- 5 files changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 75bd9b3ebbb7..20059ef4459a 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -274,7 +274,6 @@ static int kdb_bp(int argc, const char **argv) int i, bpno; kdb_bp_t *bp, *bp_check; int diag; - int free; char *symname = NULL; long offset = 0ul; int nextarg; @@ -305,7 +304,6 @@ static int kdb_bp(int argc, const char **argv) /* * Find an empty bp structure to allocate */ - free = KDB_MAXBPT; for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { if (bp->bp_free) break; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ce669174f355..1decafbb6b1a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -1091,11 +1091,10 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { - struct hrtimer_clock_base *base; unsigned long flags; ktime_t rem; - base = lock_hrtimer_base(timer, &flags); + lock_hrtimer_base(timer, &flags); rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ab661ebc4895..134f7edb30c6 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1313,7 +1313,7 @@ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu, int load_idx) { - struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; + struct sched_group *idlest = NULL, *group = sd->groups; unsigned long min_load = ULONG_MAX, this_load = 0; int imbalance = 100 + (sd->imbalance_pct-100)/2; @@ -1348,7 +1348,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, if (local_group) { this_load = avg_load; - this = group; } else if (avg_load < min_load) { min_load = avg_load; idlest = group; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ca38e8e3e907..f88552c6d227 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1713,10 +1713,7 @@ static __init int sysctl_init(void) { sysctl_set_parent(NULL, root_table); #ifdef CONFIG_SYSCTL_SYSCALL_CHECK - { - int err; - err = sysctl_check_table(current->nsproxy, root_table); - } + sysctl_check_table(current->nsproxy, root_table); #endif return 0; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 19cccc3c3028..492197e2f86c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2985,13 +2985,11 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) static void rb_advance_iter(struct ring_buffer_iter *iter) { - struct ring_buffer *buffer; struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; unsigned length; cpu_buffer = iter->cpu_buffer; - buffer = cpu_buffer->buffer; /* * Check if we are at the end of the buffer. -- cgit v1.2.3 From da2b71edd8a7db44fe1746261410a981f3e03632 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 23 Aug 2010 13:42:51 -0700 Subject: sched: Move sched_avg_update() to update_cpu_load() Currently sched_avg_update() (which updates rt_avg stats in the rq) is getting called from scale_rt_power() (in the load balance context) which doesn't take rq->lock. Fix it by moving the sched_avg_update() to more appropriate update_cpu_load() where the CFS load gets updated as well. Signed-off-by: Suresh Siddha Signed-off-by: Peter Zijlstra LKML-Reference: <1282596171.2694.3.camel@sbsiddha-MOBL3> Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ kernel/sched_fair.c | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 09b574e7f4df..ed09d4f2a69c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1294,6 +1294,10 @@ static void resched_task(struct task_struct *p) static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } + +static void sched_avg_update(struct rq *rq) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -3182,6 +3186,8 @@ static void update_cpu_load(struct rq *this_rq) this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; } + + sched_avg_update(this_rq); } static void update_cpu_load_active(struct rq *this_rq) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ab661ebc4895..f53ec7550056 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2268,8 +2268,6 @@ unsigned long scale_rt_power(int cpu) struct rq *rq = cpu_rq(cpu); u64 total, available; - sched_avg_update(rq); - total = sched_avg_period() + (rq->clock - rq->age_stamp); available = total - rq->rt_avg; -- cgit v1.2.3 From 85a0fdfd0f967507f3903e8419bc7e408f5a59de Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Thu, 9 Sep 2010 16:37:35 -0700 Subject: gcov: fix null-pointer dereference for certain module types The gcov-kernel infrastructure expects that each object file is loaded only once. This may not be true, e.g. when loading multiple kernel modules which are linked to the same object file. As a result, loading such kernel modules will result in incorrect gcov results while unloading will cause a null-pointer dereference. This patch fixes these problems by changing the gcov-kernel infrastructure so that multiple profiling data sets can be associated with one debugfs entry. It applies to 2.6.36-rc1. Signed-off-by: Peter Oberparleiter Reported-by: Werner Spies Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/fs.c | 244 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 180 insertions(+), 64 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index ef3c3f88a7a3..f83972b16564 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -33,10 +33,11 @@ * @children: child nodes * @all: list head for list of all nodes * @parent: parent node - * @info: associated profiling data structure if not a directory - * @ghost: when an object file containing profiling data is unloaded we keep a - * copy of the profiling data here to allow collecting coverage data - * for cleanup code. Such a node is called a "ghost". + * @loaded_info: array of pointers to profiling data sets for loaded object + * files. + * @num_loaded: number of profiling data sets for loaded object files. + * @unloaded_info: accumulated copy of profiling data sets for unloaded + * object files. Used only when gcov_persist=1. * @dentry: main debugfs entry, either a directory or data file * @links: associated symbolic links * @name: data file basename @@ -51,10 +52,11 @@ struct gcov_node { struct list_head children; struct list_head all; struct gcov_node *parent; - struct gcov_info *info; - struct gcov_info *ghost; + struct gcov_info **loaded_info; + struct gcov_info *unloaded_info; struct dentry *dentry; struct dentry **links; + int num_loaded; char name[0]; }; @@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = { }; /* - * Return the profiling data set for a given node. This can either be the - * original profiling data structure or a duplicate (also called "ghost") - * in case the associated object file has been unloaded. + * Return a profiling data set associated with the given node. This is + * either a data set for a loaded object file or a data set copy in case + * all associated object files have been unloaded. */ static struct gcov_info *get_node_info(struct gcov_node *node) { - if (node->info) - return node->info; + if (node->num_loaded > 0) + return node->loaded_info[0]; - return node->ghost; + return node->unloaded_info; +} + +/* + * Return a newly allocated profiling data set which contains the sum of + * all profiling data associated with the given node. + */ +static struct gcov_info *get_accumulated_info(struct gcov_node *node) +{ + struct gcov_info *info; + int i = 0; + + if (node->unloaded_info) + info = gcov_info_dup(node->unloaded_info); + else + info = gcov_info_dup(node->loaded_info[i++]); + if (!info) + return NULL; + for (; i < node->num_loaded; i++) + gcov_info_add(info, node->loaded_info[i]); + + return info; } /* @@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file) mutex_lock(&node_lock); /* * Read from a profiling data copy to minimize reference tracking - * complexity and concurrent access. + * complexity and concurrent access and to keep accumulating multiple + * profiling data sets associated with one node simple. */ - info = gcov_info_dup(get_node_info(node)); + info = get_accumulated_info(node); if (!info) goto out_unlock; iter = gcov_iter_new(info); @@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name) return NULL; } +/* + * Reset all profiling data associated with the specified node. + */ +static void reset_node(struct gcov_node *node) +{ + int i; + + if (node->unloaded_info) + gcov_info_reset(node->unloaded_info); + for (i = 0; i < node->num_loaded; i++) + gcov_info_reset(node->loaded_info[i]); +} + static void remove_node(struct gcov_node *node); /* * write() implementation for gcov data files. Reset profiling data for the - * associated file. If the object file has been unloaded (i.e. this is - * a "ghost" node), remove the debug fs node as well. + * corresponding file. If all associated object files have been unloaded, + * remove the debug fs node as well. */ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr, node = get_node_by_name(info->filename); if (node) { /* Reset counts or remove node for unloaded modules. */ - if (node->ghost) + if (node->num_loaded == 0) remove_node(node); else - gcov_info_reset(node->info); + reset_node(node); } /* Reset counts for open file. */ gcov_info_reset(info); @@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info, INIT_LIST_HEAD(&node->list); INIT_LIST_HEAD(&node->children); INIT_LIST_HEAD(&node->all); - node->info = info; + if (node->loaded_info) { + node->loaded_info[0] = info; + node->num_loaded = 1; + } node->parent = parent; if (name) strcpy(node->name, name); @@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent, struct gcov_node *node; node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL); - if (!node) { - pr_warning("out of memory\n"); - return NULL; + if (!node) + goto err_nomem; + if (info) { + node->loaded_info = kcalloc(1, sizeof(struct gcov_info *), + GFP_KERNEL); + if (!node->loaded_info) + goto err_nomem; } init_node(node, info, name, parent); /* Differentiate between gcov data file nodes and directory nodes. */ @@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent, list_add(&node->all, &all_head); return node; + +err_nomem: + kfree(node); + pr_warning("out of memory\n"); + return NULL; } /* Remove symbolic links associated with node. */ @@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node) list_del(&node->all); debugfs_remove(node->dentry); remove_links(node); - if (node->ghost) - gcov_info_free(node->ghost); + kfree(node->loaded_info); + if (node->unloaded_info) + gcov_info_free(node->unloaded_info); kfree(node); } @@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent, /* * write() implementation for reset file. Reset all profiling data to zero - * and remove ghost nodes. + * and remove nodes for which all associated object files are unloaded. */ static ssize_t reset_write(struct file *file, const char __user *addr, size_t len, loff_t *pos) @@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr, mutex_lock(&node_lock); restart: list_for_each_entry(node, &all_head, all) { - if (node->info) - gcov_info_reset(node->info); + if (node->num_loaded > 0) + reset_node(node); else if (list_empty(&node->children)) { remove_node(node); /* Several nodes may have gone - restart loop. */ @@ -564,37 +614,115 @@ err_remove: } /* - * The profiling data set associated with this node is being unloaded. Store a - * copy of the profiling data and turn this node into a "ghost". + * Associate a profiling data set with an existing node. Needs to be called + * with node_lock held. */ -static int ghost_node(struct gcov_node *node) +static void add_info(struct gcov_node *node, struct gcov_info *info) { - node->ghost = gcov_info_dup(node->info); - if (!node->ghost) { - pr_warning("could not save data for '%s' (out of memory)\n", - node->info->filename); - return -ENOMEM; + struct gcov_info **loaded_info; + int num = node->num_loaded; + + /* + * Prepare new array. This is done first to simplify cleanup in + * case the new data set is incompatible, the node only contains + * unloaded data sets and there's not enough memory for the array. + */ + loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); + if (!loaded_info) { + pr_warning("could not add '%s' (out of memory)\n", + info->filename); + return; + } + memcpy(loaded_info, node->loaded_info, + num * sizeof(struct gcov_info *)); + loaded_info[num] = info; + /* Check if the new data set is compatible. */ + if (num == 0) { + /* + * A module was unloaded, modified and reloaded. The new + * data set replaces the copy of the last one. + */ + if (!gcov_info_is_compatible(node->unloaded_info, info)) { + pr_warning("discarding saved data for %s " + "(incompatible version)\n", info->filename); + gcov_info_free(node->unloaded_info); + node->unloaded_info = NULL; + } + } else { + /* + * Two different versions of the same object file are loaded. + * The initial one takes precedence. + */ + if (!gcov_info_is_compatible(node->loaded_info[0], info)) { + pr_warning("could not add '%s' (incompatible " + "version)\n", info->filename); + kfree(loaded_info); + return; + } } - node->info = NULL; + /* Overwrite previous array. */ + kfree(node->loaded_info); + node->loaded_info = loaded_info; + node->num_loaded = num + 1; +} - return 0; +/* + * Return the index of a profiling data set associated with a node. + */ +static int get_info_index(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + for (i = 0; i < node->num_loaded; i++) { + if (node->loaded_info[i] == info) + return i; + } + return -ENOENT; } /* - * Profiling data for this node has been loaded again. Add profiling data - * from previous instantiation and turn this node into a regular node. + * Save the data of a profiling data set which is being unloaded. */ -static void revive_node(struct gcov_node *node, struct gcov_info *info) +static void save_info(struct gcov_node *node, struct gcov_info *info) { - if (gcov_info_is_compatible(node->ghost, info)) - gcov_info_add(info, node->ghost); + if (node->unloaded_info) + gcov_info_add(node->unloaded_info, info); else { - pr_warning("discarding saved data for '%s' (version changed)\n", + node->unloaded_info = gcov_info_dup(info); + if (!node->unloaded_info) { + pr_warning("could not save data for '%s' " + "(out of memory)\n", info->filename); + } + } +} + +/* + * Disassociate a profiling data set from a node. Needs to be called with + * node_lock held. + */ +static void remove_info(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + i = get_info_index(node, info); + if (i < 0) { + pr_warning("could not remove '%s' (not found)\n", info->filename); + return; } - gcov_info_free(node->ghost); - node->ghost = NULL; - node->info = info; + if (gcov_persist) + save_info(node, info); + /* Shrink array. */ + node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; + node->num_loaded--; + if (node->num_loaded > 0) + return; + /* Last loaded data set was removed. */ + kfree(node->loaded_info); + node->loaded_info = NULL; + node->num_loaded = 0; + if (!node->unloaded_info) + remove_node(node); } /* @@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) node = get_node_by_name(info->filename); switch (action) { case GCOV_ADD: - /* Add new node or revive ghost. */ - if (!node) { + if (node) + add_info(node, info); + else add_node(info); - break; - } - if (gcov_persist) - revive_node(node, info); - else { - pr_warning("could not add '%s' (already exists)\n", - info->filename); - } break; case GCOV_REMOVE: - /* Remove node or turn into ghost. */ - if (!node) { + if (node) + remove_info(node, info); + else { pr_warning("could not remove '%s' (not found)\n", info->filename); - break; } - if (gcov_persist) { - if (!ghost_node(node)) - break; - } - remove_node(node); break; } mutex_unlock(&node_lock); -- cgit v1.2.3 From 31583bb0cf6cc40f2a468a4d2f3b9cbefd24f891 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 9 Sep 2010 16:37:37 -0700 Subject: cgroups: fix API thinko Add cgroup_attach_task_all() The existing cgroup_attach_task_current_cg() API is called by a thread to attach another thread to all of its cgroups; this is unsuitable for cases where a privileged task wants to attach itself to the cgroups of a less privileged one, since the call must be made from the context of the target task. This patch adds a more generic cgroup_attach_task_all() API that allows both the source task and to-be-moved task to be specified. cgroup_attach_task_current_cg() becomes a specialization of the more generic new function. [menage@google.com: rewrote changelog] [akpm@linux-foundation.org: address reviewer comments] Signed-off-by: Michael S. Tsirkin Tested-by: Alex Williamson Acked-by: Paul Menage Cc: Li Zefan Cc: Ben Blum Cc: Sridhar Samudrala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 12 +++++++++++- kernel/cgroup.c | 13 +++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ed3e92e41c6e..0c991023ee47 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -578,7 +578,12 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task(struct cgroup *, struct task_struct *); -int cgroup_attach_task_current_cg(struct task_struct *); +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); + +static inline int cgroup_attach_task_current_cg(struct task_struct *tsk) +{ + return cgroup_attach_task_all(current, tsk); +} /* * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works @@ -636,6 +641,11 @@ static inline int cgroupstats_build(struct cgroupstats *stats, } /* No cgroups - nothing to do */ +static inline int cgroup_attach_task_all(struct task_struct *from, + struct task_struct *t) +{ + return 0; +} static inline int cgroup_attach_task_current_cg(struct task_struct *t) { return 0; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 192f88c5b0f9..c9483d8f6140 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1791,19 +1791,20 @@ out: } /** - * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup + * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' + * @from: attach to all cgroups of a given task * @tsk: the task to be attached */ -int cgroup_attach_task_current_cg(struct task_struct *tsk) +int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) { struct cgroupfs_root *root; - struct cgroup *cur_cg; int retval = 0; cgroup_lock(); for_each_active_root(root) { - cur_cg = task_cgroup_from_root(current, root); - retval = cgroup_attach_task(cur_cg, tsk); + struct cgroup *from_cg = task_cgroup_from_root(from, root); + + retval = cgroup_attach_task(from_cg, tsk); if (retval) break; } @@ -1811,7 +1812,7 @@ int cgroup_attach_task_current_cg(struct task_struct *tsk) return retval; } -EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); +EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex -- cgit v1.2.3 From 1c24de60e50fb19b94d94225458da17c720f0729 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Thu, 9 Sep 2010 16:37:59 -0700 Subject: kernel/groups.c: fix integer overflow in groups_search gid_t is a unsigned int. If group_info contains a gid greater than MAX_INT, groups_search() function may look on the wrong side of the search tree. This solves some unfair "permission denied" problems. Signed-off-by: Jerome Marchand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/groups.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/groups.c b/kernel/groups.c index 53b1916c9492..253dc0f35cf4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp) right = group_info->ngroups; while (left < right) { unsigned int mid = (left+right)/2; - int cmp = grp - GROUP_AT(group_info, mid); - if (cmp > 0) + if (grp > GROUP_AT(group_info, mid)) left = mid + 1; - else if (cmp < 0) + else if (grp < GROUP_AT(group_info, mid)) right = mid; else return 1; -- cgit v1.2.3 From 910321ea817a202ff70fac666e37e2c8e2f88823 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 9 Sep 2010 16:38:07 -0700 Subject: swap: revert special hibernation allocation Please revert 2.6.36-rc commit d2997b1042ec150616c1963b5e5e919ffd0b0ebf "hibernation: freeze swap at hibernation". It complicated matters by adding a second swap allocation path, just for hibernation; without in any way fixing the issue that it was intended to address - page reclaim after fixing the hibernation image might free swap from a page already imaged as swapcache, letting its swap be reallocated to store a different page of the image: resulting in data corruption if the imaged page were freed as clean then swapped back in. Pages freed to si->swap_map were still in danger of being reallocated by the alternative allocation path. I guess it inadvertently fixed slow SSD swap allocation for hibernation, as reported by Nigel Cunningham: by missing out the discards that occur on the usual swap allocation path; but that was unintentional, and needs a separate fix. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: "Rafael J. Wysocki" Cc: Ondrej Zary Cc: Andrea Gelmini Cc: Balbir Singh Cc: Andrea Arcangeli Cc: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 8 +---- kernel/power/hibernate.c | 1 - kernel/power/snapshot.c | 1 - kernel/power/swap.c | 6 ++-- mm/swapfile.c | 94 ++++++++++++------------------------------------ 5 files changed, 26 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2fee51a11b73..bf4eb62506db 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -315,6 +315,7 @@ extern long nr_swap_pages; extern long total_swap_pages; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); +extern swp_entry_t get_swap_page_of_type(int); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); @@ -331,13 +332,6 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_HIBERNATION -void hibernation_freeze_swap(void); -void hibernation_thaw_swap(void); -swp_entry_t get_swap_for_hibernation(int type); -void swap_free_for_hibernation(swp_entry_t val); -#endif - /* linux/mm/thrash.c */ extern struct mm_struct *swap_token_mm; extern void grab_swap_token(struct mm_struct *); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c77963938bca..8dc31e02ae12 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -338,7 +338,6 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); - hibernation_freeze_swap(); saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5e7edfb05e66..f6cd6faf84fd 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1086,7 +1086,6 @@ void swsusp_free(void) buffer = NULL; alloc_normal = 0; alloc_highmem = 0; - hibernation_thaw_swap(); } /* Helper functions used for the shrinking of memory. */ diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 5d0059eed3e4..e6a5bdf61a37 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -136,10 +136,10 @@ sector_t alloc_swapdev_block(int swap) { unsigned long offset; - offset = swp_offset(get_swap_for_hibernation(swap)); + offset = swp_offset(get_swap_page_of_type(swap)); if (offset) { if (swsusp_extents_insert(offset)) - swap_free_for_hibernation(swp_entry(swap, offset)); + swap_free(swp_entry(swap, offset)); else return swapdev_block(swap, offset); } @@ -163,7 +163,7 @@ void free_all_swap_pages(int swap) ext = container_of(node, struct swsusp_extent, node); rb_erase(node, &swsusp_extents); for (offset = ext->start; offset <= ext->end; offset++) - swap_free_for_hibernation(swp_entry(swap, offset)); + swap_free(swp_entry(swap, offset)); kfree(ext); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 1f3f9c59a73a..f08d165871b3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -47,8 +47,6 @@ long nr_swap_pages; long total_swap_pages; static int least_priority; -static bool swap_for_hibernation; - static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; @@ -453,8 +451,6 @@ swp_entry_t get_swap_page(void) spin_lock(&swap_lock); if (nr_swap_pages <= 0) goto noswap; - if (swap_for_hibernation) - goto noswap; nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { @@ -487,6 +483,28 @@ noswap: return (swp_entry_t) {0}; } +/* The only caller of this function is now susupend routine */ +swp_entry_t get_swap_page_of_type(int type) +{ + struct swap_info_struct *si; + pgoff_t offset; + + spin_lock(&swap_lock); + si = swap_info[type]; + if (si && (si->flags & SWP_WRITEOK)) { + nr_swap_pages--; + /* This is called for allocating swap entry, not cache */ + offset = scan_swap_map(si, 1); + if (offset) { + spin_unlock(&swap_lock); + return swp_entry(type, offset); + } + nr_swap_pages++; + } + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; +} + static struct swap_info_struct *swap_info_get(swp_entry_t entry) { struct swap_info_struct *p; @@ -746,74 +764,6 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) #endif #ifdef CONFIG_HIBERNATION - -static pgoff_t hibernation_offset[MAX_SWAPFILES]; -/* - * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise, - * saved swap_map[] image to the disk will be an incomplete because it's - * changing without synchronization with hibernation snap shot. - * At resume, we just make swap_for_hibernation=false. We can forget - * used maps easily. - */ -void hibernation_freeze_swap(void) -{ - int i; - - spin_lock(&swap_lock); - - printk(KERN_INFO "PM: Freeze Swap\n"); - swap_for_hibernation = true; - for (i = 0; i < MAX_SWAPFILES; i++) - hibernation_offset[i] = 1; - spin_unlock(&swap_lock); -} - -void hibernation_thaw_swap(void) -{ - spin_lock(&swap_lock); - if (swap_for_hibernation) { - printk(KERN_INFO "PM: Thaw Swap\n"); - swap_for_hibernation = false; - } - spin_unlock(&swap_lock); -} - -/* - * Because updateing swap_map[] can make not-saved-status-change, - * we use our own easy allocator. - * Please see kernel/power/swap.c, Used swaps are recorded into - * RB-tree. - */ -swp_entry_t get_swap_for_hibernation(int type) -{ - pgoff_t off; - swp_entry_t val = {0}; - struct swap_info_struct *si; - - spin_lock(&swap_lock); - - si = swap_info[type]; - if (!si || !(si->flags & SWP_WRITEOK)) - goto done; - - for (off = hibernation_offset[type]; off < si->max; ++off) { - if (!si->swap_map[off]) - break; - } - if (off < si->max) { - val = swp_entry(type, off); - hibernation_offset[type] = off + 1; - } -done: - spin_unlock(&swap_lock); - return val; -} - -void swap_free_for_hibernation(swp_entry_t ent) -{ - /* Nothing to do */ -} - /* * Find the swap type that corresponds to given device (if any). * -- cgit v1.2.3 From df09162550fbb53354f0c88e85b5d0e6129ee9cc Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 9 Sep 2010 16:34:59 -0700 Subject: tracing: t_start: reset FTRACE_ITER_HASH in case of seek/pread Be sure to avoid entering t_show() with FTRACE_ITER_HASH set without having properly started the iterator to iterate the hash. This case is degenerate and, as discovered by Robert Swiecki, can cause t_hash_show() to misuse a pointer. This causes a NULL ptr deref with possible security implications. Tracked as CVE-2010-3079. Cc: Robert Swiecki Cc: Eugene Teo Cc: Signed-off-by: Chris Wright Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83a16e9ee518..fa7ece649fe1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1510,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (*pos > 0) return t_hash_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; return iter; } -- cgit v1.2.3 From 0109c2c48d062a04685638926a35ed20153fedc8 Mon Sep 17 00:00:00 2001 From: mark gross Date: Thu, 9 Sep 2010 23:20:09 +0200 Subject: PM QoS: Correct pr_debug() misuse and improve parameter checks Correct some pr_debug() misuse and add a stronger parameter check to pm_qos_write() for the ASCII hex value case. Thanks to Dan Carpenter for pointing out the problem! Signed-off-by: mark gross Signed-off-by: Rafael J. Wysocki --- kernel/pm_qos_params.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index b7e4c362361b..645e541a45f6 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -389,10 +389,12 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, } else if (count == 11) { /* len('0x12345678/0') */ if (copy_from_user(ascii_value, buf, 11)) return -EFAULT; + if (strlen(ascii_value) != 10) + return -EINVAL; x = sscanf(ascii_value, "%x", &value); if (x != 1) return -EINVAL; - pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value); + pr_debug("%s, %d, 0x%x\n", ascii_value, x, value); } else return -EINVAL; -- cgit v1.2.3 From 6715045ddc7472a22be5e49d4047d2d89b391f45 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 11 Sep 2010 20:58:27 +0200 Subject: PM / Hibernate: Avoid hitting OOM during preallocation of memory There is a problem in hibernate_preallocate_memory() that it calls preallocate_image_memory() with an argument that may be greater than the total number of available non-highmem memory pages. If that's the case, the OOM condition is guaranteed to trigger, which in turn can cause significant slowdown to occur during hibernation. To avoid that, make preallocate_image_memory() adjust its argument before calling preallocate_image_pages(), so that the total number of saveable non-highem pages left is not less than the minimum size of a hibernation image. Change hibernate_preallocate_memory() to try to allocate from highmem if the number of pages allocated by preallocate_image_memory() is too low. Modify free_unnecessary_pages() to take all possible memory allocation patterns into account. Reported-by: KOSAKI Motohiro Signed-off-by: Rafael J. Wysocki Tested-by: M. Vefa Bicakci --- kernel/power/snapshot.c | 85 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 65 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5e7edfb05e66..5209b39e6982 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1122,9 +1122,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) return nr_alloc; } -static unsigned long preallocate_image_memory(unsigned long nr_pages) +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) { - return preallocate_image_pages(nr_pages, GFP_IMAGE); + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); } #ifdef CONFIG_HIGHMEM @@ -1170,15 +1180,22 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, */ static void free_unnecessary_pages(void) { - unsigned long save_highmem, to_free_normal, to_free_highmem; + unsigned long save, to_free_normal, to_free_highmem; - to_free_normal = alloc_normal - count_data_pages(); - save_highmem = count_highmem_pages(); - if (alloc_highmem > save_highmem) { - to_free_highmem = alloc_highmem - save_highmem; + save = count_data_pages(); + if (alloc_normal >= save) { + to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; } else { to_free_highmem = 0; - to_free_normal -= save_highmem - alloc_highmem; + to_free_normal -= save - alloc_highmem; } memory_bm_position_reset(©_bm); @@ -1259,7 +1276,7 @@ int hibernate_preallocate_memory(void) { struct zone *zone; unsigned long saveable, size, max_size, count, highmem, pages = 0; - unsigned long alloc, save_highmem, pages_highmem; + unsigned long alloc, save_highmem, pages_highmem, avail_normal; struct timeval start, stop; int error; @@ -1296,6 +1313,7 @@ int hibernate_preallocate_memory(void) else count += zone_page_state(zone, NR_FREE_PAGES); } + avail_normal = count; count += highmem; count -= totalreserve_pages; @@ -1310,12 +1328,21 @@ int hibernate_preallocate_memory(void) */ if (size >= saveable) { pages = preallocate_image_highmem(save_highmem); - pages += preallocate_image_memory(saveable - pages); + pages += preallocate_image_memory(saveable - pages, avail_normal); goto out; } /* Estimate the minimum size of the image. */ pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; if (size < pages) size = min_t(unsigned long, pages, max_size); @@ -1336,16 +1363,34 @@ int hibernate_preallocate_memory(void) */ pages_highmem = preallocate_image_highmem(highmem / 2); alloc = (count - max_size) - pages_highmem; - pages = preallocate_image_memory(alloc); - if (pages < alloc) - goto err_out; - size = max_size - size; - alloc = size; - size = preallocate_highmem_fraction(size, highmem, count); - pages_highmem += size; - alloc -= size; - pages += preallocate_image_memory(alloc); - pages += pages_highmem; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) + goto err_out; + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } /* * We only need as many page frames for the image as there are saveable -- cgit v1.2.3 From 0bf377bbb0bea6130f35613491887cc622e42a8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Sep 2010 08:14:52 +0200 Subject: sched: Improve latencies under load by decreasing minimum scheduling granularity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mathieu reported bad latencies with make -j10 kind of kbuild workloads - which is mostly caused by us scheduling with a too coarse granularity. Reduce the minimum granularity some more, to make sure we can meet the latency target. I got the following results (make -j10 kbuild load, average of 3 runs): vanilla: maximum latency: 38278.9 µs average latency: 7730.1 µs patched: maximum latency: 22702.1 µs average latency: 6684.8 µs Mathieu also measured it: | | * wakeup-latency.c (SIGEV_THREAD) with make -j10 | | - Mainline 2.6.35.2 kernel | | maximum latency: 45762.1 µs | average latency: 7348.6 µs | | - With only Peter's smaller min_gran (shown below): | | maximum latency: 29100.6 µs | average latency: 6684.1 µs | Reported-by: Mathieu Desnoyers Reported-by: Linus Torvalds Acked-by: Mathieu Desnoyers Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9b5b4f86b742..a171138a9402 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -54,13 +54,13 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 2000000ULL; -unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL; +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -static unsigned int sched_nr_latency = 3; +static unsigned int sched_nr_latency = 8; /* * After fork, child runs first. If set to 0 (default) then -- cgit v1.2.3 From 2bccfffd1538f3523847583213567e2f7ce00926 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Sep 2010 08:43:22 -0400 Subject: tracing: Do not reset *pos in set_ftrace_filter After the filtered functions are read, the probed functions are read from the hash in set_ftrace_filter. When the hashed probed functions are read, the *pos passed in is reset. Instead of modifying the pos given to the read function, just record the pos where the filtered functions ended and subtract from that. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe1..585ea27025b1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1368,6 +1368,7 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { + loff_t func_pos; struct ftrace_page *pg; int hidx; int idx; @@ -1418,12 +1419,15 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) loff_t l; if (!(iter->flags & FTRACE_ITER_HASH)) - *pos = 0; + iter->func_pos = *pos; + + if (iter->func_pos > *pos) + return NULL; iter->flags |= FTRACE_ITER_HASH; iter->hidx = 0; - for (l = 0; l <= *pos; ) { + for (l = 0; l <= (*pos - iter->func_pos); ) { p = t_hash_next(m, p, &l); if (!p) break; -- cgit v1.2.3 From 4aeb69672d011fac5c8df671f3ca89f7987c104e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Sep 2010 10:00:28 -0400 Subject: tracing: Replace typecasted void pointer in set_ftrace_filter code The set_ftrace_filter uses seq_file and reads from two lists. The pointer returned by t_next() can either be of type struct dyn_ftrace or struct ftrace_func_probe. If there is a bug (there was one) the wrong pointer may be used and the reference can cause an oops. This patch makes t_next() and friends only return the iterator structure which now has a pointer of type struct dyn_ftrace and struct ftrace_func_probe. The t_show() can now test if the pointer is NULL or not and if the pointer exists, it is guaranteed to be of the correct type. Now if there's a bug, only wrong data will be shown but not an oops. Cc: Chris Wright Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 67 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 585ea27025b1..c8db0dbb984e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1368,25 +1368,29 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - loff_t func_pos; - struct ftrace_page *pg; - int hidx; - int idx; - unsigned flags; - struct trace_parser parser; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + int hidx; + int idx; + unsigned flags; }; static void * -t_hash_next(struct seq_file *m, void *v, loff_t *pos) +t_hash_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = v; + struct hlist_node *hnd = NULL; struct hlist_head *hhd; WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); (*pos)++; + if (iter->probe) + hnd = &iter->probe->node; retry: if (iter->hidx >= FTRACE_FUNC_HASHSIZE) return NULL; @@ -1409,7 +1413,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) } } - return hnd; + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; } static void *t_hash_start(struct seq_file *m, loff_t *pos) @@ -1428,19 +1437,24 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) iter->hidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, p, &l); + p = t_hash_next(m, &l); if (!p) break; } - return p; + if (!p) + return NULL; + + return iter; } -static int t_hash_show(struct seq_file *m, void *v) +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) { struct ftrace_func_probe *rec; - struct hlist_node *hnd = v; - rec = hlist_entry(hnd, struct ftrace_func_probe, node); + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1461,7 +1475,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct dyn_ftrace *rec = NULL; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, v, pos); + return t_hash_next(m, pos); (*pos)++; @@ -1495,7 +1509,12 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } - return rec; + if (!rec) + return NULL; + + iter->func = rec; + + return iter; } static void *t_start(struct seq_file *m, loff_t *pos) @@ -1530,10 +1549,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p && iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); - return p; + return NULL; + } + + return iter; } static void t_stop(struct seq_file *m, void *p) @@ -1544,16 +1567,18 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = v; + struct dyn_ftrace *rec; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, v); + return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; } + rec = iter->func; + if (!rec) return 0; -- cgit v1.2.3 From 98c4fd046f07156ca6055677e8f03d4280be16c1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Sep 2010 11:47:43 -0400 Subject: tracing: Keep track of set_ftrace_filter position and allow lseek again This patch keeps track of the index within the elements of set_ftrace_filter and if the position goes backwards, it nicely resets and starts from the beginning again. This allows for lseek and pread to work properly now. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c8db0dbb984e..2d51166b93fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1368,6 +1368,7 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { + loff_t pos; loff_t func_pos; struct ftrace_page *pg; struct dyn_ftrace *func; @@ -1385,9 +1386,8 @@ t_hash_next(struct seq_file *m, loff_t *pos) struct hlist_node *hnd = NULL; struct hlist_head *hhd; - WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); - (*pos)++; + iter->pos = *pos; if (iter->probe) hnd = &iter->probe->node; @@ -1427,14 +1427,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_HASH)) - iter->func_pos = *pos; - if (iter->func_pos > *pos) return NULL; - iter->flags |= FTRACE_ITER_HASH; - iter->hidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { p = t_hash_next(m, &l); @@ -1444,6 +1439,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) if (!p) return NULL; + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_HASH; + return iter; } @@ -1478,6 +1476,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return t_hash_next(m, pos); (*pos)++; + iter->pos = *pos; + iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return NULL; @@ -1517,6 +1517,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return iter; } +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); +} + static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; @@ -1524,6 +1531,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) loff_t l; mutex_lock(&ftrace_lock); + /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all @@ -1541,6 +1554,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { @@ -2447,7 +2465,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = no_llseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; -- cgit v1.2.3 From 57c072c7113f54f9512624d6c665db6184448782 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Sep 2010 11:21:11 -0400 Subject: tracing: Fix reading of set_ftrace_filter across lists If we do: # cd /sys/kernel/debug # echo 'do_IRQ:traceon schedule:traceon sys_write:traceon' > \ set_ftrace_filter # cat set_ftrace_filter We get the following output: #### all functions enabled #### sys_write:traceon:unlimited schedule:traceon:unlimited do_IRQ:traceon:unlimited This outputs two lists. One is the fact that all functions are currently enabled for function tracing, the other has three probed functions, which happen to have 'traceon' as their commands. Currently, when reading the first list (functions enabled) the seq_file code will receive a "NULL" from the t_next() function causing it to exit early. This makes "read()" from userspace stop reading the code at this boarder. Although read is allowed to do this, some (broken) applications might consider this an end of file and stop early. This patch adds the start of the second list to t_next() when it finishes the first list. It is a simple change and gives the set_ftrace_filter file nicer reading ability. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d51166b93fe..1884cf5bc110 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1477,10 +1477,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; iter->pos = *pos; - iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) - return NULL; + return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -1510,8 +1509,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } if (!rec) - return NULL; + return t_hash_start(m, pos); + iter->func_pos = *pos; iter->func = rec; return iter; -- cgit v1.2.3