From 4673247562e39a17e09440fa1400819522ccd446 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 7 Jun 2010 17:53:51 +0200 Subject: genirq: Deal with desc->set_type() changing desc->chip The set_type() function can change the chip implementation when the trigger mode changes. That might result in using an non-initialized irq chip when called from __setup_irq() or when called via set_irq_type() on an already enabled irq. The set_irq_type() function should not be called on an enabled irq, but because we forgot to put a check into it, we have a bunch of users which grew the habit of doing that and it never blew up as the function is serialized via desc->lock against all users of desc->chip and they never hit the non-initialized irq chip issue. The easy fix for the __setup_irq() issue would be to move the irq_chip_set_defaults(desc->chip) call after the trigger setting to make sure that a chip change is covered. But as we have already users, which do the type setting after request_irq(), the safe fix for now is to call irq_chip_set_defaults() from __irq_set_trigger() when desc->set_type() changed the irq chip. It needs a deeper analysis whether we should refuse to change the chip on an already enabled irq, but that'd be a large scale change to fix all the existing users. So that's neither stable nor 2.6.35 material. Reported-by: Esben Haabendal Signed-off-by: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: linuxppc-dev Cc: stable@kernel.org --- kernel/irq/manage.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3164ba7ce151..e1497481fe8a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; + + if (chip != desc->chip) + irq_chip_set_defaults(desc->chip); } return ret; -- cgit v1.2.3 From dd4c4f17d722ffeb2515bf781400675a30fcead7 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Fri, 28 May 2010 16:32:14 -0400 Subject: suspend: Move NVS save/restore code to generic suspend functionality Saving platform non-volatile state may be required for suspend to RAM as well as hibernation. Move it to more generic code. Signed-off-by: Matthew Garrett Acked-by: Rafael J. Wysocki Tested-by: Maxim Levitsky Signed-off-by: Len Brown --- arch/x86/kernel/e820.c | 2 +- drivers/acpi/sleep.c | 12 ++-- include/linux/suspend.h | 26 ++++----- kernel/power/Kconfig | 9 +-- kernel/power/Makefile | 2 +- kernel/power/hibernate_nvs.c | 136 ------------------------------------------- kernel/power/nvs.c | 136 +++++++++++++++++++++++++++++++++++++++++++ kernel/power/suspend.c | 6 ++ 8 files changed, 168 insertions(+), 161 deletions(-) delete mode 100644 kernel/power/hibernate_nvs.c create mode 100644 kernel/power/nvs.c (limited to 'kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7bca3c6a02fb..0d6fc71bedb1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -729,7 +729,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - hibernate_nvs_register(ei->addr, ei->size); + suspend_nvs_register(ei->addr, ei->size); } return 0; diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 4ab2275b4461..bcaa6efa8136 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -393,7 +393,7 @@ static int acpi_hibernation_begin(void) { int error; - error = s4_no_nvs ? 0 : hibernate_nvs_alloc(); + error = s4_no_nvs ? 0 : suspend_nvs_alloc(); if (!error) { acpi_target_sleep_state = ACPI_STATE_S4; acpi_sleep_tts_switch(acpi_target_sleep_state); @@ -407,7 +407,7 @@ static int acpi_hibernation_pre_snapshot(void) int error = acpi_pm_prepare(); if (!error) - hibernate_nvs_save(); + suspend_nvs_save(); return error; } @@ -432,7 +432,7 @@ static int acpi_hibernation_enter(void) static void acpi_hibernation_finish(void) { - hibernate_nvs_free(); + suspend_nvs_free(); acpi_pm_finish(); } @@ -452,7 +452,7 @@ static void acpi_hibernation_leave(void) panic("ACPI S4 hardware signature mismatch"); } /* Restore the NVS memory area */ - hibernate_nvs_restore(); + suspend_nvs_restore(); } static int acpi_pm_pre_restore(void) @@ -501,7 +501,7 @@ static int acpi_hibernation_begin_old(void) if (!error) { if (!s4_no_nvs) - error = hibernate_nvs_alloc(); + error = suspend_nvs_alloc(); if (!error) acpi_target_sleep_state = ACPI_STATE_S4; } @@ -513,7 +513,7 @@ static int acpi_hibernation_pre_snapshot_old(void) int error = acpi_pm_disable_gpes(); if (!error) - hibernate_nvs_save(); + suspend_nvs_save(); return error; } diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 5e781d824e6d..bc7d6bb4cd8e 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -256,22 +256,22 @@ static inline int hibernate(void) { return -ENOSYS; } static inline bool system_entering_hibernation(void) { return false; } #endif /* CONFIG_HIBERNATION */ -#ifdef CONFIG_HIBERNATION_NVS -extern int hibernate_nvs_register(unsigned long start, unsigned long size); -extern int hibernate_nvs_alloc(void); -extern void hibernate_nvs_free(void); -extern void hibernate_nvs_save(void); -extern void hibernate_nvs_restore(void); -#else /* CONFIG_HIBERNATION_NVS */ -static inline int hibernate_nvs_register(unsigned long a, unsigned long b) +#ifdef CONFIG_SUSPEND_NVS +extern int suspend_nvs_register(unsigned long start, unsigned long size); +extern int suspend_nvs_alloc(void); +extern void suspend_nvs_free(void); +extern void suspend_nvs_save(void); +extern void suspend_nvs_restore(void); +#else /* CONFIG_SUSPEND_NVS */ +static inline int suspend_nvs_register(unsigned long a, unsigned long b) { return 0; } -static inline int hibernate_nvs_alloc(void) { return 0; } -static inline void hibernate_nvs_free(void) {} -static inline void hibernate_nvs_save(void) {} -static inline void hibernate_nvs_restore(void) {} -#endif /* CONFIG_HIBERNATION_NVS */ +static inline int suspend_nvs_alloc(void) { return 0; } +static inline void suspend_nvs_free(void) {} +static inline void suspend_nvs_save(void) {} +static inline void suspend_nvs_restore(void) {} +#endif /* CONFIG_SUSPEND_NVS */ #ifdef CONFIG_PM_SLEEP void save_processor_state(void); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5c36ea9d55d2..ca6066a6952e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n +config SUSPEND_NVS + bool + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE + select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -130,13 +134,10 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. -config HIBERNATION_NVS - bool - config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE - select HIBERNATION_NVS if HAS_IOMEM + select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 524e058dcf06..f9063c6b185d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o +obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/hibernate_nvs.c deleted file mode 100644 index fdcad9ed5a7b..000000000000 --- a/kernel/power/hibernate_nvs.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory - * - * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. - * - * This file is released under the GPLv2. - */ - -#include -#include -#include -#include -#include -#include - -/* - * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent - * resume. The code below implements a mechanism allowing us to do that. - */ - -struct nvs_page { - unsigned long phys_start; - unsigned int size; - void *kaddr; - void *data; - struct list_head node; -}; - -static LIST_HEAD(nvs_list); - -/** - * hibernate_nvs_register - register platform NVS memory region to save - * @start - physical address of the region - * @size - size of the region - * - * The NVS region need not be page-aligned (both ends) and we arrange - * things so that the data from page-aligned addresses in this region will - * be copied into separate RAM pages. - */ -int hibernate_nvs_register(unsigned long start, unsigned long size) -{ - struct nvs_page *entry, *next; - - while (size > 0) { - unsigned int nr_bytes; - - entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); - if (!entry) - goto Error; - - list_add_tail(&entry->node, &nvs_list); - entry->phys_start = start; - nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); - entry->size = (size < nr_bytes) ? size : nr_bytes; - - start += entry->size; - size -= entry->size; - } - return 0; - - Error: - list_for_each_entry_safe(entry, next, &nvs_list, node) { - list_del(&entry->node); - kfree(entry); - } - return -ENOMEM; -} - -/** - * hibernate_nvs_free - free data pages allocated for saving NVS regions - */ -void hibernate_nvs_free(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - free_page((unsigned long)entry->data); - entry->data = NULL; - if (entry->kaddr) { - iounmap(entry->kaddr); - entry->kaddr = NULL; - } - } -} - -/** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions - */ -int hibernate_nvs_alloc(void) -{ - struct nvs_page *entry; - - list_for_each_entry(entry, &nvs_list, node) { - entry->data = (void *)__get_free_page(GFP_KERNEL); - if (!entry->data) { - hibernate_nvs_free(); - return -ENOMEM; - } - } - return 0; -} - -/** - * hibernate_nvs_save - save NVS memory regions - */ -void hibernate_nvs_save(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Saving platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) { - entry->kaddr = ioremap(entry->phys_start, entry->size); - memcpy(entry->data, entry->kaddr, entry->size); - } -} - -/** - * hibernate_nvs_restore - restore NVS memory regions - * - * This function is going to be called with interrupts disabled, so it - * cannot iounmap the virtual addresses used to access the NVS region. - */ -void hibernate_nvs_restore(void) -{ - struct nvs_page *entry; - - printk(KERN_INFO "PM: Restoring platform NVS memory\n"); - - list_for_each_entry(entry, &nvs_list, node) - if (entry->data) - memcpy(entry->kaddr, entry->data, entry->size); -} diff --git a/kernel/power/nvs.c b/kernel/power/nvs.c new file mode 100644 index 000000000000..1836db60bbb6 --- /dev/null +++ b/kernel/power/nvs.c @@ -0,0 +1,136 @@ +/* + * linux/kernel/power/hibernate_nvs.c - Routines for handling NVS memory + * + * Copyright (C) 2008,2009 Rafael J. Wysocki , Novell Inc. + * + * This file is released under the GPLv2. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * suspend and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * suspend_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int suspend_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * suspend_nvs_free - free data pages allocated for saving NVS regions + */ +void suspend_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int suspend_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + suspend_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * suspend_nvs_save - save NVS memory regions + */ +void suspend_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * suspend_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. + */ +void suspend_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 56e7dbb8b996..f37cb7dd4402 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -16,6 +16,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include "power.h" -- cgit v1.2.3 From a8fb2608053547bc3152ea61a5ec7cdfce5d942c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 10 Jun 2010 14:53:16 -0400 Subject: perf/tracing: Fix regression of perf losing kprobe events With the addition of the code to shrink the kernel tracepoint infrastructure, we lost kprobes being traced by perf. The reason is that I tested if the "tp_event->class->perf_probe" existed before enabling it. This prevents "ftrace only" events (like the function trace events) from being enabled by perf. Unfortunately, kprobe events do not use perf_probe. This causes kprobes to be missed by perf. To fix this, we add the test to see if "tp_event->class->reg" exists as well as perf_probe. Normal trace events have only "perf_probe" but no "reg" function, and kprobes and syscalls have the "reg" but no "perf_probe". The ftrace unique events do not have either, so this is a valid test. If a kprobe or syscall is not to be probed by perf, the "reg" function is called anyway, and will return a failure and prevent perf from probing it. Reported-by: Srikar Dronamraju Tested-by: Srikar Dronamraju Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index e6f65887842c..8a2b73f7c068 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -96,7 +96,9 @@ int perf_trace_init(struct perf_event *p_event) mutex_lock(&event_mutex); list_for_each_entry(tp_event, &ftrace_events, list) { if (tp_event->event.type == event_id && - tp_event->class && tp_event->class->perf_probe && + tp_event->class && + (tp_event->class->perf_probe || + tp_event->class->reg) && try_module_get(tp_event->mod)) { ret = perf_trace_event_init(tp_event, p_event); break; -- cgit v1.2.3 From 3310d4d38fbc514e7b18bd3b1eea8effdd63b5aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Jun 2010 18:02:37 +0200 Subject: nohz: Fix nohz ratelimit Chris Wedgwood reports that 39c0cbe (sched: Rate-limit nohz) causes a serial console regression, unresponsiveness, and indeed it does. The reason is that the nohz code is skipped even when the tick was already stopped before the nohz_ratelimit(cpu) condition changed. Move the nohz_ratelimit() check to the other conditions which prevent long idle sleeps. Reported-by: Chris Wedgwood Tested-by: Brian Bloniarz Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Jiri Kosina Cc: Linus Torvalds Cc: Greg KH Cc: Alan Cox Cc: OGAWA Hirofumi Cc: Jef Driesen LKML-Reference: <1276790557.27822.516.camel@twins> Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c034..783fbadf2202 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; } - if (nohz_ratelimit(cpu)) - goto end; - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { @@ -328,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { -- cgit v1.2.3 From 3c93717cfa51316e4dbb471e7c0f9d243359d5f8 Mon Sep 17 00:00:00 2001 From: "Alex,Shi" Date: Thu, 17 Jun 2010 14:08:13 +0800 Subject: sched: Fix over-scheduling bug Commit e70971591 ("sched: Optimize unused cgroup configuration") introduced an imbalanced scheduling bug. If we do not use CGROUP, function update_h_load won't update h_load. When the system has a large number of tasks far more than logical CPU number, the incorrect cfs_rq[cpu]->h_load value will cause load_balance() to pull too many tasks to the local CPU from the busiest CPU. So the busiest CPU keeps going in a round robin. That will hurt performance. The issue was found originally by a scientific calculation workload that developed by Yanmin. With that commit, the workload performance drops about 40%. CPU before after 00 : 2 : 7 01 : 1 : 7 02 : 11 : 6 03 : 12 : 7 04 : 6 : 6 05 : 11 : 7 06 : 10 : 6 07 : 12 : 7 08 : 11 : 6 09 : 12 : 6 10 : 1 : 6 11 : 1 : 6 12 : 6 : 6 13 : 2 : 6 14 : 2 : 6 15 : 1 : 6 Reviewed-by: Yanmin zhang Signed-off-by: Alex Shi Signed-off-by: Peter Zijlstra LKML-Reference: <1276754893.9452.5442.camel@debian> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 2aaceebd484c..6c9e7c8735bf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1657,9 +1657,6 @@ static void update_shares(struct sched_domain *sd) static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -- cgit v1.2.3 From f3b577dec1f2ce32d2db6d2ca6badff7002512af Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Tue, 1 Jun 2010 14:06:13 +0100 Subject: rcu: apply RCU protection to wake_affine() The task_group() function returns a pointer that must be protected by either RCU, the ->alloc_lock, or the cgroup lock (see the rcu_dereference_check() in task_subsys_state(), which is invoked by task_group()). The wake_affine() function currently does none of these, which means that a concurrent update would be within its rights to free the structure returned by task_group(). Because wake_affine() uses this structure only to compute load-balancing heuristics, there is no reason to acquire either of the two locks. Therefore, this commit introduces an RCU read-side critical section that starts before the first call to task_group() and ends after the last use of the "tg" pointer returned from task_group(). Thanks to Li Zefan for pointing out the need to extend the RCU read-side critical section from that proposed by the original patch. Signed-off-by: Daniel J Blueman Signed-off-by: Paul E. McKenney --- kernel/sched_fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index eed35eded602..a878b5332daa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1240,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * effect of the currently running task from the load * of the current CPU: */ + rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1275,6 +1276,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) balanced = this_eff_load <= prev_eff_load; } else balanced = true; + rcu_read_unlock(); /* * If the currently running task will sleep within -- cgit v1.2.3 From 8695159967957015f8dfb49315d6f88e111d90e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Jun 2010 11:44:53 +0200 Subject: sched: silence PROVE_RCU in sched_fork() Because cgroup_fork() is ran before sched_fork() [ from copy_process() ] and the child's pid is not yet visible the child is pinned to its cgroup. Therefore we can silence this warning. A nicer solution would be moving cgroup_fork() to right after dup_task_struct() and exclude PF_STARTING from task_subsys_state(). Signed-off-by: Peter Zijlstra Reviewed-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/sched.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f8b8996228dd..a2d215d132f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2494,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) -- cgit v1.2.3 From 0d98bb2656e9bd2dfda2d089db1fe1dbdab41504 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Mon, 24 May 2010 12:11:43 -0700 Subject: sched: Prevent compiler from optimising the sched_avg_update() loop GCC 4.4.1 on ARM has been observed to replace the while loop in sched_avg_update with a call to uldivmod, resulting in the following build failure at link-time: kernel/built-in.o: In function `sched_avg_update': kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' kernel/sched.c:1261: undefined reference to `__aeabi_uldivmod' make: *** [.tmp_vmlinux1] Error 1 This patch introduces a fake data hazard to the loop body to prevent the compiler optimising the loop away. Signed-off-by: Will Deacon Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra Cc: Catalin Marinas Cc: Russell King Cc: Linus Torvalds Cc: Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6c9e7c8735bf..a24d6d5d83f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1254,6 +1254,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } -- cgit v1.2.3 From e05bd3367bd3d88715b53766f95bb3a8ec7ab59e Mon Sep 17 00:00:00 2001 From: Pavan Naregundi Date: Tue, 29 Jun 2010 15:05:28 -0700 Subject: kexec: fix Oops in crash_shrink_memory() When crashkernel is not enabled, "echo 0 > /sys/kernel/kexec_crash_size" OOPSes the kernel in crash_shrink_memory. This happens when crash_shrink_memory tries to release the 'crashk_res' resource which are not reserved. Also value of "/sys/kernel/kexec_crash_size" shows as 1, which should be 0. This patch fixes the OOPS in crash_shrink_memory and shows "/sys/kernel/kexec_crash_size" as 0 when crash kernel memory is not reserved. Signed-off-by: Pavan Naregundi Reviewed-by: WANG Cong Cc: Simon Horman Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 474a84715eac..131b1703936f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs) size_t crash_get_memory_size(void) { - size_t size; + size_t size = 0; mutex_lock(&kexec_mutex); - size = crashk_res.end - crashk_res.start + 1; + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; mutex_unlock(&kexec_mutex); return size; } @@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) + if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); crashk_res.end = end - 1; -- cgit v1.2.3 From 7a0ea09ad5352efce8fe79ed853150449903b9f5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 30 Jun 2010 09:51:19 +0200 Subject: futex: futex_find_get_task remove credentails check futex_find_get_task is currently used (through lookup_pi_state) from two contexts, futex_requeue and futex_lock_pi_atomic. None of the paths looks it needs the credentials check, though. Different (e)uids shouldn't matter at all because the only thing that is important for shared futex is the accessibility of the shared memory. The credentail check results in glibc assert failure or process hang (if glibc is compiled without assert support) for shared robust pthread mutex with priority inheritance if a process tries to lock already held lock owned by a process with a different euid: pthread_mutex_lock.c:312: __pthread_mutex_lock_full: Assertion `(-(e)) != 3 || !robust' failed. The problem is that futex_lock_pi_atomic which is called when we try to lock already held lock checks the current holder (tid is stored in the futex value) to get the PI state. It uses lookup_pi_state which in turn gets task struct from futex_find_get_task. ESRCH is returned either when the task is not found or if credentials check fails. futex_lock_pi_atomic simply returns if it gets ESRCH. glibc code, however, doesn't expect that robust lock returns with ESRCH because it should get either success or owner died. Signed-off-by: Michal Hocko Acked-by: Darren Hart Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Nick Piggin Cc: Alexey Kuznetsov Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/futex.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index e7a35f1039e7..6a3a5fa1526d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p) { - p = ERR_PTR(-ESRCH); - } else { - pcred = __task_cred(p); - if (cred->euid != pcred->euid && - cred->euid != pcred->uid) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - } + if (p) + get_task_struct(p); rcu_read_unlock(); @@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); + if (!p) + return -ESRCH; /* * We need to look at the task state flags to figure out, -- cgit v1.2.3 From 8c215bd3890c347dfb6a2db4779755f8b9c298a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Jul 2010 09:07:17 +0200 Subject: sched: Cure nr_iowait_cpu() users Commit 0224cf4c5e (sched: Intoduce get_cpu_iowait_time_us()) broke things by not making sure preemption was indeed disabled by the callers of nr_iowait_cpu() which took the iowait value of the current cpu. This resulted in a heap of preempt warnings. Cure this by making nr_iowait_cpu() take a cpu number and fix up the callers to pass in the right number. Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Cc: Sergey Senozhatsky Cc: Rafael J. Wysocki Cc: Maxim Levitsky Cc: Len Brown Cc: Pavel Machek Cc: Jiri Slaby Cc: linux-pm@lists.linux-foundation.org LKML-Reference: <1277968037.1868.120.camel@laptop> Signed-off-by: Ingo Molnar --- drivers/cpuidle/governors/menu.c | 4 ++-- include/linux/sched.h | 2 +- kernel/sched.c | 4 ++-- kernel/time/tick-sched.c | 16 ++++++++-------- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 52ff8aa63f84..1b128702d300 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -143,7 +143,7 @@ static inline int which_bucket(unsigned int duration) * This allows us to calculate * E(duration)|iowait */ - if (nr_iowait_cpu()) + if (nr_iowait_cpu(smp_processor_id())) bucket = BUCKETS/2; if (duration < 10) @@ -175,7 +175,7 @@ static inline int performance_multiplier(void) mult += 2 * get_loadavg(); /* for IO wait tasks (per cpu!) we add 5x each */ - mult += 10 * nr_iowait_cpu(); + mult += 10 * nr_iowait_cpu(smp_processor_id()); return mult; } diff --git a/include/linux/sched.h b/include/linux/sched.h index f118809c953f..747fcaedddb7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -139,7 +139,7 @@ extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_iowait(void); -extern unsigned long nr_iowait_cpu(void); +extern unsigned long nr_iowait_cpu(int cpu); extern unsigned long this_cpu_load(void); diff --git a/kernel/sched.c b/kernel/sched.c index a24d6d5d83f6..f87abe3b0176 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2864,9 +2864,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c034..1a6f828e57a0 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now) * Updates the per cpu time idle statistics counters */ static void -update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - if (nr_iowait_cpu() > 0) + if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->idle_entrytime = now; } @@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { ktime_t now; now = ktime_get(); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_entrytime = now; ts->idle_active = 1; @@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } @@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->iowait_sleeptime); } @@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates -- cgit v1.2.3 From ff49d74ad383f54041378144ca1a229ee9aeaa59 Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Sat, 3 Jul 2010 13:07:35 +1000 Subject: module: initialize module dynamic debug later We should initialize the module dynamic debug datastructures only after determining that the module is not loaded yet. This fixes a bug that introduced in 2.6.35-rc2, where when a trying to load a module twice, we also load it's dynamic printing data twice which causes all sorts of nasty issues. Also handle the dynamic debug cleanup later on failure. Signed-off-by: Yehuda Sadeh Signed-off-by: Rusty Russell (removed a #ifdef) Signed-off-by: Linus Torvalds --- include/linux/dynamic_debug.h | 4 ++-- kernel/module.c | 23 +++++++++++++++-------- lib/dynamic_debug.c | 2 +- 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index b3cd4de9432b..52c0da4bdd18 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -40,7 +40,7 @@ int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); #if defined(CONFIG_DYNAMIC_DEBUG) -extern int ddebug_remove_module(char *mod_name); +extern int ddebug_remove_module(const char *mod_name); #define __dynamic_dbg_enabled(dd) ({ \ int __ret = 0; \ @@ -73,7 +73,7 @@ extern int ddebug_remove_module(char *mod_name); #else -static inline int ddebug_remove_module(char *mod) +static inline int ddebug_remove_module(const char *mod) { return 0; } diff --git a/kernel/module.c b/kernel/module.c index 8c6b42840dd1..5d2d28197c82 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2062,6 +2062,12 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) #endif } +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); @@ -2124,6 +2130,8 @@ static noinline struct module *load_module(void __user *umod, void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; void __percpu *percpu; + struct _ddebug *debug = NULL; + unsigned int num_debug = 0; mm_segment_t old_fs; @@ -2476,15 +2484,9 @@ static noinline struct module *load_module(void __user *umod, kfree(strmap); strmap = NULL; - if (!mod->taints) { - struct _ddebug *debug; - unsigned int num_debug; - + if (!mod->taints) debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - if (debug) - dynamic_debug_setup(debug, num_debug); - } err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2526,10 +2528,13 @@ static noinline struct module *load_module(void __user *umod, goto unlock; } + if (debug) + dynamic_debug_setup(debug, num_debug); + /* Find duplicate symbols */ err = verify_export_symbols(mod); if (err < 0) - goto unlock; + goto ddebug; list_add_rcu(&mod->list, &modules); mutex_unlock(&module_mutex); @@ -2557,6 +2562,8 @@ static noinline struct module *load_module(void __user *umod, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + ddebug: + dynamic_debug_remove(debug); unlock: mutex_unlock(&module_mutex); synchronize_sched(); diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 3df8eb17a607..02afc2533728 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -692,7 +692,7 @@ static void ddebug_table_free(struct ddebug_table *dt) * Called in response to a module being unloaded. Removes * any ddebug_table's which point at the module. */ -int ddebug_remove_module(char *mod_name) +int ddebug_remove_module(const char *mod_name) { struct ddebug_table *dt, *nextdt; int ret = -ENOENT; -- cgit v1.2.3 From 9078370c0d2cfe4a905aa34f398bbb0d65921a2b Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 19 Jul 2010 11:54:15 +0100 Subject: kmemleak: Add support for NO_BOOTMEM configurations With commits 08677214 and 59be5a8e, alloc_bootmem()/free_bootmem() and friends use the early_res functions for memory management when NO_BOOTMEM is enabled. This patch adds the kmemleak calls in the corresponding code paths for bootmem allocations. Signed-off-by: Catalin Marinas Acked-by: Pekka Enberg Acked-by: Yinghai Lu Cc: H. Peter Anvin Cc: stable@kernel.org --- kernel/early_res.c | 6 ++++++ mm/page_alloc.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/early_res.c b/kernel/early_res.c index 31aa9332ef3f..7bfae887f211 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include /* * Early reserved memory areas. @@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + i = find_overlapped_early(start, end); r = &early_res[i]; if (i >= max_early_res || r->end != end || r->start != start) @@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + if (start == end) return; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 431214b941ac..68319dd20bed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3659,6 +3659,11 @@ void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, ptr = phys_to_virt(addr); memset(ptr, 0, size); reserve_early_without_check(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); return ptr; } -- cgit v1.2.3