From 9a0f05cb36888550d1509d60aa55788615abea44 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 15:13:29 +0100 Subject: perf: Update the mmap control page on mmap() Apparently we didn't update the mmap control page right after mmap(), which leads to surprises when userspace wants to use it. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-dcpi7164djsexmx6ya7lilrc@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2f8f3f103cb4..0ca1f648ac08 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3534,6 +3534,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) event->mmap_user = get_current_user(); vma->vm_mm->pinned_vm += event->mmap_locked; + perf_event_update_userpage(event); + unlock: if (!ret) atomic_inc(&event->mmap_count); -- cgit v1.2.3 From 35edc2a5095efb189e60dc32bbb9d2663aec6d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 20:36:02 +0100 Subject: perf, arch: Rework perf_event_index() Put the logic to compute the event index into a per pmu method. This is required because the x86 rules are weird and wonderful and don't match the capabilities of the current scheme. AFAIK only powerpc actually has a usable userspace read of the PMCs but I'm not at all sure anybody actually used that. ARM is restored to the default since it currently does not support userspace access at all. And all software events are provided with a method that reports their index as 0 (disabled). Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Richard Kuo Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-dfydxodki16lylkt3gl2j7cw@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/include/asm/perf_event.h | 4 ---- arch/frv/include/asm/perf_event.h | 2 -- arch/hexagon/include/asm/perf_event.h | 2 -- arch/powerpc/include/asm/perf_event_server.h | 2 -- arch/powerpc/kernel/perf_event.c | 6 ++++++ arch/s390/include/asm/perf_event.h | 1 - arch/x86/include/asm/perf_event.h | 2 -- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 27 ++++++++++++++++++++++----- kernel/events/hw_breakpoint.c | 7 +++++++ 10 files changed, 41 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/perf_event.h b/arch/arm/include/asm/perf_event.h index 0f8e3827a89b..08f94d8fc04c 100644 --- a/arch/arm/include/asm/perf_event.h +++ b/arch/arm/include/asm/perf_event.h @@ -12,10 +12,6 @@ #ifndef __ARM_PERF_EVENT_H__ #define __ARM_PERF_EVENT_H__ -/* ARM performance counters start from 1 (in the cp15 accesses) so use the - * same indexes here for consistency. */ -#define PERF_EVENT_INDEX_OFFSET 1 - /* ARM perf PMU IDs for use by internal perf clients. */ enum arm_perf_pmu_ids { ARM_PERF_PMU_ID_XSCALE1 = 0, diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h index a69e0155d146..c52ea5546b5b 100644 --- a/arch/frv/include/asm/perf_event.h +++ b/arch/frv/include/asm/perf_event.h @@ -12,6 +12,4 @@ #ifndef _ASM_PERF_EVENT_H #define _ASM_PERF_EVENT_H -#define PERF_EVENT_INDEX_OFFSET 0 - #endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/hexagon/include/asm/perf_event.h b/arch/hexagon/include/asm/perf_event.h index 6c2910f91180..8b8526b491c7 100644 --- a/arch/hexagon/include/asm/perf_event.h +++ b/arch/hexagon/include/asm/perf_event.h @@ -19,6 +19,4 @@ #ifndef _ASM_PERF_EVENT_H #define _ASM_PERF_EVENT_H -#define PERF_EVENT_INDEX_OFFSET 0 - #endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index 8f1df1208d23..1a8093fa8f71 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -61,8 +61,6 @@ struct pt_regs; extern unsigned long perf_misc_flags(struct pt_regs *regs); extern unsigned long perf_instruction_pointer(struct pt_regs *regs); -#define PERF_EVENT_INDEX_OFFSET 1 - /* * Only override the default definitions in include/linux/perf_event.h * if we have hardware PMU support. diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index 10a140f82cb8..d614ab57ccca 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1187,6 +1187,11 @@ static int power_pmu_event_init(struct perf_event *event) return err; } +static int power_pmu_event_idx(struct perf_event *event) +{ + return event->hw.idx; +} + struct pmu power_pmu = { .pmu_enable = power_pmu_enable, .pmu_disable = power_pmu_disable, @@ -1199,6 +1204,7 @@ struct pmu power_pmu = { .start_txn = power_pmu_start_txn, .cancel_txn = power_pmu_cancel_txn, .commit_txn = power_pmu_commit_txn, + .event_idx = power_pmu_event_idx, }; /* diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index a75f168d2718..4eb444edbe49 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -6,4 +6,3 @@ /* Empty, just to avoid compiling error */ -#define PERF_EVENT_INDEX_OFFSET 0 diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 096c975e099f..9b922c136254 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -188,8 +188,6 @@ extern u32 get_ibs_caps(void); #ifdef CONFIG_PERF_EVENTS extern void perf_events_lapic_init(void); -#define PERF_EVENT_INDEX_OFFSET 0 - /* * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups. * This flag is otherwise unused and ABI specified to be 0, so nobody should diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 08855613ceb3..02545e6df95b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -680,6 +680,12 @@ struct pmu { * for each successful ->add() during the transaction. */ void (*cancel_txn) (struct pmu *pmu); /* optional */ + + /* + * Will return the value for perf_event_mmap_page::index for this event, + * if no implementation is provided it will default to: event->hw.idx + 1. + */ + int (*event_idx) (struct perf_event *event); /*optional */ }; /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 0ca1f648ac08..3894309c41a2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3208,10 +3208,6 @@ int perf_event_task_disable(void) return 0; } -#ifndef PERF_EVENT_INDEX_OFFSET -# define PERF_EVENT_INDEX_OFFSET 0 -#endif - static int perf_event_index(struct perf_event *event) { if (event->hw.state & PERF_HES_STOPPED) @@ -3220,7 +3216,7 @@ static int perf_event_index(struct perf_event *event) if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; - return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; + return event->pmu->event_idx(event); } static void calc_timer_values(struct perf_event *event, @@ -4992,6 +4988,11 @@ static int perf_swevent_init(struct perf_event *event) return 0; } +static int perf_swevent_event_idx(struct perf_event *event) +{ + return 0; +} + static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, @@ -5001,6 +5002,8 @@ static struct pmu perf_swevent = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; #ifdef CONFIG_EVENT_TRACING @@ -5087,6 +5090,8 @@ static struct pmu perf_tracepoint = { .start = perf_swevent_start, .stop = perf_swevent_stop, .read = perf_swevent_read, + + .event_idx = perf_swevent_event_idx, }; static inline void perf_tp_register(void) @@ -5306,6 +5311,8 @@ static struct pmu perf_cpu_clock = { .start = cpu_clock_event_start, .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; /* @@ -5378,6 +5385,8 @@ static struct pmu perf_task_clock = { .start = task_clock_event_start, .stop = task_clock_event_stop, .read = task_clock_event_read, + + .event_idx = perf_swevent_event_idx, }; static void perf_pmu_nop_void(struct pmu *pmu) @@ -5405,6 +5414,11 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) perf_pmu_enable(pmu); } +static int perf_event_idx_default(struct perf_event *event) +{ + return event->hw.idx + 1; +} + /* * Ensures all contexts with the same task_ctx_nr have the same * pmu_cpu_context too. @@ -5594,6 +5608,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->event_idx) + pmu->event_idx = perf_event_idx_default; + list_add_rcu(&pmu->entry, &pmus); ret = 0; unlock: diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..b0309f76d777 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -613,6 +613,11 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) bp->hw.state = PERF_HES_STOPPED; } +static int hw_breakpoint_event_idx(struct perf_event *bp) +{ + return 0; +} + static struct pmu perf_breakpoint = { .task_ctx_nr = perf_sw_context, /* could eventually get its own */ @@ -622,6 +627,8 @@ static struct pmu perf_breakpoint = { .start = hw_breakpoint_start, .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, + + .event_idx = hw_breakpoint_event_idx, }; int __init init_hw_breakpoint(void) -- cgit v1.2.3 From 365a4038486b57bb2bd516706a80f82f250f5306 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 20:58:59 +0100 Subject: perf: Fix mmap_page::offset computation There's multiple reason the counter might be unavailable, change the condition to !->index since perf_event_index() should return 0 for all those cases. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-1ixr3olci40w8rgv2evv2ldh@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3894309c41a2..05affc3878ff 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3268,7 +3268,7 @@ void perf_event_update_userpage(struct perf_event *event) barrier(); userpg->index = perf_event_index(event); userpg->offset = perf_event_count(event); - if (event->state == PERF_EVENT_STATE_ACTIVE) + if (userpg->index) userpg->offset -= local64_read(&event->hw.prev_count); userpg->time_enabled = enabled + -- cgit v1.2.3 From 0c9d42ed4cee2aa1dfc3a260b741baae8615744f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 20 Nov 2011 23:30:47 +0100 Subject: perf, x86: Provide means for disabling userspace RDPMC Allow the disabling of RDPMC via a pmu specific attribute: echo 0 > /sys/bus/event_source/devices/cpu/rdpmc Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-pqeog465zo5hsimtkfz73f27@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 55 +++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/perf_event.h | 8 ++++++ include/linux/perf_event.h | 1 + kernel/events/core.c | 1 + 4 files changed, 64 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 53b569910175..116b040a73a8 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -1210,7 +1211,8 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) break; case CPU_STARTING: - set_in_cr4(X86_CR4_PCE); + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1320,6 +1322,8 @@ static int __init init_hw_perf_events(void) } } + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + pr_info("... version: %d\n", x86_pmu.version); pr_info("... bit width: %d\n", x86_pmu.cntval_bits); pr_info("... generic registers: %d\n", x86_pmu.num_counters); @@ -1555,10 +1559,59 @@ static int x86_pmu_event_idx(struct perf_event *event) return idx + 1; } +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} + +static void change_rdpmc(void *info) +{ + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); +} + +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul(buf, NULL, 0); + + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + smp_call_function(change_rdpmc, (void *)val, 1); + } + + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + NULL, +}; + static struct pmu pmu = { .pmu_enable = x86_pmu_enable, .pmu_disable = x86_pmu_disable, + .attr_groups = x86_pmu_attr_groups, + .event_init = x86_pmu_event_init, .add = x86_pmu_add, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 8944062f46e2..513d617b93c4 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -307,6 +307,14 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; + /* + * sysfs attrs + */ + int attr_rdpmc; + + /* + * CPU Hotplug hooks + */ int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 02545e6df95b..5311b79fe62c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -615,6 +615,7 @@ struct pmu { struct list_head entry; struct device *dev; + const struct attribute_group **attr_groups; char *name; int type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 05affc3878ff..dcd4049e92fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5505,6 +5505,7 @@ static int pmu_dev_alloc(struct pmu *pmu) if (!pmu->dev) goto out; + pmu->dev->groups = pmu->attr_groups; device_initialize(pmu->dev); ret = dev_set_name(pmu->dev, "%s", pmu->name); if (ret) -- cgit v1.2.3 From e3f3541c19c89a4daae39300defba68943301949 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Nov 2011 11:43:53 +0100 Subject: perf: Extend the mmap control page with time (TSC) fields Extend the mmap control page with fields so that userspace can compute time deltas relative to the provided time fields. Currently only implemented for x86 with constant and nonstop TSC. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Arun Sharma Link: http://lkml.kernel.org/n/tip-3u1jucza77j3wuvs0x2bic0f@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++++++++++ include/linux/perf_event.h | 4 +++- kernel/events/core.c | 21 ++++++++++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 116b040a73a8..f8bddb5b0600 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "perf_event.h" @@ -1627,6 +1628,19 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, }; +void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return; + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + return; + + userpg->time_mult = this_cpu_read(cyc2ns); + userpg->time_shift = CYC2NS_SCALE_FACTOR; + userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; +} + /* * callchain support */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5311b79fe62c..0b91db2522cc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -291,12 +291,14 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ + __u32 time_mult, time_shift; + __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[123]; /* align to 1k */ + __u64 __reserved[121]; /* align to 1k */ /* * Control data for the mmap() data buffer. diff --git a/kernel/events/core.c b/kernel/events/core.c index dcd4049e92fc..3a9c7d81afbf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3220,17 +3220,22 @@ static int perf_event_index(struct perf_event *event) } static void calc_timer_values(struct perf_event *event, + u64 *now, u64 *enabled, u64 *running) { - u64 now, ctx_time; + u64 ctx_time; - now = perf_clock(); - ctx_time = event->shadow_ctx_time + now; + *now = perf_clock(); + ctx_time = event->shadow_ctx_time + *now; *enabled = ctx_time - event->tstamp_enabled; *running = ctx_time - event->tstamp_running; } +void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +{ +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch @@ -3240,7 +3245,7 @@ void perf_event_update_userpage(struct perf_event *event) { struct perf_event_mmap_page *userpg; struct ring_buffer *rb; - u64 enabled, running; + u64 enabled, running, now; rcu_read_lock(); /* @@ -3252,7 +3257,7 @@ void perf_event_update_userpage(struct perf_event *event) * because of locking issue as we can be called in * NMI context */ - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); rb = rcu_dereference(event->rb); if (!rb) goto unlock; @@ -3277,6 +3282,8 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); + perf_update_user_clock(userpg, now); + barrier(); ++userpg->lock; preempt_enable(); @@ -3763,7 +3770,7 @@ static void perf_output_read_group(struct perf_output_handle *handle, static void perf_output_read(struct perf_output_handle *handle, struct perf_event *event) { - u64 enabled = 0, running = 0; + u64 enabled = 0, running = 0, now; u64 read_format = event->attr.read_format; /* @@ -3776,7 +3783,7 @@ static void perf_output_read(struct perf_output_handle *handle, * NMI context */ if (read_format & PERF_FORMAT_TOTAL_TIMES) - calc_timer_values(event, &enabled, &running); + calc_timer_values(event, &now, &enabled, &running); if (event->attr.read_format & PERF_FORMAT_GROUP) perf_output_read_group(handle, event, enabled, running); -- cgit v1.2.3 From 6c303d3ab39f0dc69546f179c424ee1124f50906 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Nov 2011 21:13:48 +0100 Subject: tracing: let trace_signal_generate() report more info, kill overflow_fail/lose_info __send_signal()->trace_signal_generate() doesn't report enough info. The users want to know was the signal actually delivered or not, and they also need the shared/private info. The patch moves trace_signal_generate() at the end of __send_signal() and adds the 2 additional arguments. This also allows us to kill trace_signal_overflow_fail/lose_info, we can simply add the appropriate TRACE_SIGNAL_ "result" codes. Reported-by: Seiji Aguchi Reviewed-by: Seiji Aguchi Acked-by: Steven Rostedt Signed-off-by: Oleg Nesterov --- include/trace/events/signal.h | 85 +++++++++++-------------------------------- kernel/signal.c | 22 +++++++---- 2 files changed, 36 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/signal.h b/include/trace/events/signal.h index 17df43464df0..39a8a430d90f 100644 --- a/include/trace/events/signal.h +++ b/include/trace/events/signal.h @@ -23,11 +23,23 @@ } \ } while (0) +#ifndef TRACE_HEADER_MULTI_READ +enum { + TRACE_SIGNAL_DELIVERED, + TRACE_SIGNAL_IGNORED, + TRACE_SIGNAL_ALREADY_PENDING, + TRACE_SIGNAL_OVERFLOW_FAIL, + TRACE_SIGNAL_LOSE_INFO, +}; +#endif + /** * signal_generate - called when a signal is generated * @sig: signal number * @info: pointer to struct siginfo * @task: pointer to struct task_struct + * @group: shared or private + * @result: TRACE_SIGNAL_* * * Current process sends a 'sig' signal to 'task' process with * 'info' siginfo. If 'info' is SEND_SIG_NOINFO or SEND_SIG_PRIV, @@ -37,9 +49,10 @@ */ TRACE_EVENT(signal_generate, - TP_PROTO(int sig, struct siginfo *info, struct task_struct *task), + TP_PROTO(int sig, struct siginfo *info, struct task_struct *task, + int group, int result), - TP_ARGS(sig, info, task), + TP_ARGS(sig, info, task, group, result), TP_STRUCT__entry( __field( int, sig ) @@ -47,6 +60,8 @@ TRACE_EVENT(signal_generate, __field( int, code ) __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) + __field( int, group ) + __field( int, result ) ), TP_fast_assign( @@ -54,11 +69,14 @@ TRACE_EVENT(signal_generate, TP_STORE_SIGINFO(__entry, info); memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->pid = task->pid; + __entry->group = group; + __entry->result = result; ), - TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d", + TP_printk("sig=%d errno=%d code=%d comm=%s pid=%d grp=%d res=%d", __entry->sig, __entry->errno, __entry->code, - __entry->comm, __entry->pid) + __entry->comm, __entry->pid, __entry->group, + __entry->result) ); /** @@ -101,65 +119,6 @@ TRACE_EVENT(signal_deliver, __entry->sa_handler, __entry->sa_flags) ); -DECLARE_EVENT_CLASS(signal_queue_overflow, - - TP_PROTO(int sig, int group, struct siginfo *info), - - TP_ARGS(sig, group, info), - - TP_STRUCT__entry( - __field( int, sig ) - __field( int, group ) - __field( int, errno ) - __field( int, code ) - ), - - TP_fast_assign( - __entry->sig = sig; - __entry->group = group; - TP_STORE_SIGINFO(__entry, info); - ), - - TP_printk("sig=%d group=%d errno=%d code=%d", - __entry->sig, __entry->group, __entry->errno, __entry->code) -); - -/** - * signal_overflow_fail - called when signal queue is overflow - * @sig: signal number - * @group: signal to process group or not (bool) - * @info: pointer to struct siginfo - * - * Kernel fails to generate 'sig' signal with 'info' siginfo, because - * siginfo queue is overflow, and the signal is dropped. - * 'group' is not 0 if the signal will be sent to a process group. - * 'sig' is always one of RT signals. - */ -DEFINE_EVENT(signal_queue_overflow, signal_overflow_fail, - - TP_PROTO(int sig, int group, struct siginfo *info), - - TP_ARGS(sig, group, info) -); - -/** - * signal_lose_info - called when siginfo is lost - * @sig: signal number - * @group: signal to process group or not (bool) - * @info: pointer to struct siginfo - * - * Kernel generates 'sig' signal but loses 'info' siginfo, because siginfo - * queue is overflow. - * 'group' is not 0 if the signal will be sent to a process group. - * 'sig' is always one of non-RT signals. - */ -DEFINE_EVENT(signal_queue_overflow, signal_lose_info, - - TP_PROTO(int sig, int group, struct siginfo *info), - - TP_ARGS(sig, group, info) -); - #endif /* _TRACE_SIGNAL_H */ /* This part must be outside protection */ diff --git a/kernel/signal.c b/kernel/signal.c index c73c4284160e..1bd9e86fda1f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1054,13 +1054,13 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, struct sigpending *pending; struct sigqueue *q; int override_rlimit; - - trace_signal_generate(sig, info, t); + int ret = 0, result; assert_spin_locked(&t->sighand->siglock); + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, from_ancestor_ns)) - return 0; + goto ret; pending = group ? &t->signal->shared_pending : &t->pending; /* @@ -1068,8 +1068,11 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * exactly one non-rt signal, so that we can get more * detailed information about the cause of the signal. */ + result = TRACE_SIGNAL_ALREADY_PENDING; if (legacy_queue(pending, sig)) - return 0; + goto ret; + + result = TRACE_SIGNAL_DELIVERED; /* * fast-pathed signals for kernel-internal things like SIGSTOP * or SIGKILL. @@ -1127,14 +1130,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, * signal was rt and sent by user using something * other than kill(). */ - trace_signal_overflow_fail(sig, group, info); - return -EAGAIN; + result = TRACE_SIGNAL_OVERFLOW_FAIL; + ret = -EAGAIN; + goto ret; } else { /* * This is a silent loss of information. We still * send the signal, but the *info bits are lost. */ - trace_signal_lose_info(sig, group, info); + result = TRACE_SIGNAL_LOSE_INFO; } } @@ -1142,7 +1146,9 @@ out_set: signalfd_notify(t, sig); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); - return 0; +ret: + trace_signal_generate(sig, info, t, group, result); + return ret; } static int send_signal(int sig, struct siginfo *info, struct task_struct *t, -- cgit v1.2.3 From 163566f60bfe6a8176650155e2d98649b0dfabf8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 22 Nov 2011 21:37:41 +0100 Subject: tracing: send_sigqueue() needs trace_signal_generate() too Add trace_signal_generate() into send_sigqueue(). send_sigqueue() is very similar to __send_signal(), just it uses the preallocated info. It should do the same wrt tracing. Reported-by: Seiji Aguchi Reviewed-by: Seiji Aguchi Acked-by: Steven Rostedt Signed-off-by: Oleg Nesterov --- kernel/signal.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 1bd9e86fda1f..8511e39813c7 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1591,7 +1591,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) int sig = q->info.si_signo; struct sigpending *pending; unsigned long flags; - int ret; + int ret, result; BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); @@ -1600,6 +1600,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) goto ret; ret = 1; /* the signal is ignored */ + result = TRACE_SIGNAL_IGNORED; if (!prepare_signal(sig, t, 0)) goto out; @@ -1611,6 +1612,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) */ BUG_ON(q->info.si_code != SI_TIMER); q->info.si_overrun++; + result = TRACE_SIGNAL_ALREADY_PENDING; goto out; } q->info.si_overrun = 0; @@ -1620,7 +1622,9 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) list_add_tail(&q->list, &pending->list); sigaddset(&pending->signal, sig); complete_signal(sig, t, group); + result = TRACE_SIGNAL_DELIVERED; out: + trace_signal_generate(sig, &q->info, t, group, result); unlock_task_sighand(t, &flags); ret: return ret; -- cgit v1.2.3 From 245282557c49754af3dbcc732316e814340d6bce Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 20 Jan 2012 11:58:43 +0800 Subject: cgroup: move struct cgroup_pidlist out from the header file It's internally used only. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 32 -------------------------------- kernel/cgroup.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index dee53bdb046d..7da3e745b74c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -160,38 +160,6 @@ enum { CGRP_CLONE_CHILDREN, }; -/* which pidlist file are we talking about? */ -enum cgroup_filetype { - CGROUP_FILE_PROCS, - CGROUP_FILE_TASKS, -}; - -/* - * A pidlist is a list of pids that virtually represents the contents of one - * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, - * a pair (one each for procs, tasks) for each pid namespace that's relevant - * to the cgroup. - */ -struct cgroup_pidlist { - /* - * used to find which pidlist is wanted. doesn't change as long as - * this particular list stays in the list. - */ - struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; - /* array of xids */ - pid_t *list; - /* how many elements the above list has */ - int length; - /* how many files are using the current array */ - int use_count; - /* each of these stored in a list by its cgroup */ - struct list_head links; - /* pointer to the cgroup we belong to, for list removal purposes */ - struct cgroup *owner; - /* protects the other fields */ - struct rw_semaphore mutex; -}; - struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..6ca7acad7c55 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3043,6 +3043,38 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * */ +/* which pidlist file are we talking about? */ +enum cgroup_filetype { + CGROUP_FILE_PROCS, + CGROUP_FILE_TASKS, +}; + +/* + * A pidlist is a list of pids that virtually represents the contents of one + * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, + * a pair (one each for procs, tasks) for each pid namespace that's relevant + * to the cgroup. + */ +struct cgroup_pidlist { + /* + * used to find which pidlist is wanted. doesn't change as long as + * this particular list stays in the list. + */ + struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; + /* array of xids */ + pid_t *list; + /* how many elements the above list has */ + int length; + /* how many files are using the current array */ + int use_count; + /* each of these stored in a list by its cgroup */ + struct list_head links; + /* pointer to the cgroup we belong to, for list removal purposes */ + struct cgroup *owner; + /* protects the other fields */ + struct rw_semaphore mutex; +}; + /* * The following two functions "fix" the issue where there are more pids * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. -- cgit v1.2.3 From b78949ebfb563c29808a9d0a772e3adb5561bc80 Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 3 Jan 2012 21:18:30 -0800 Subject: cgroup: simplify double-check locking in cgroup_attach_proc To keep the complexity of the double-check locking in one place, move the thread_group_leader check up into attach_task_by_pid(). This allows us to use a goto instead of returning -EAGAIN. While at it, convert a couple of returns to gotos and use rcu for the !pid case also in order to simplify the logic. Changes in V2: * https://lkml.org/lkml/2011/12/22/86 (Tejun Heo) * Use a goto instead of returning -EAGAIN Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Frederic Weisbecker Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 79 +++++++++++++++++++++------------------------------------ 1 file changed, 29 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ca7acad7c55..12c07e8fd69c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2104,19 +2104,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* prevent changes to the threadgroup list while we take a snapshot. */ read_lock(&tasklist_lock); - if (!thread_group_leader(leader)) { - /* - * a race with de_thread from another thread's exec() may strip - * us of our leadership, making while_each_thread unsafe to use - * on this task. if this happens, there is no choice but to - * throw this task away and try again (from cgroup_procs_write); - * this is "double-double-toil-and-trouble-check locking". - */ - read_unlock(&tasklist_lock); - retval = -EAGAIN; - goto out_free_group_list; - } - tsk = leader; i = 0; do { @@ -2245,22 +2232,14 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) if (!cgroup_lock_live_group(cgrp)) return -ENODEV; +retry_find_task: + rcu_read_lock(); if (pid) { - rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk) { rcu_read_unlock(); - cgroup_unlock(); - return -ESRCH; - } - if (threadgroup) { - /* - * RCU protects this access, since tsk was found in the - * tid map. a race with de_thread may cause group_leader - * to stop being the leader, but cgroup_attach_proc will - * detect it later. - */ - tsk = tsk->group_leader; + ret= -ESRCH; + goto out_unlock_cgroup; } /* * even if we're attaching all tasks in the thread group, we @@ -2271,29 +2250,38 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); - cgroup_unlock(); - return -EACCES; + ret = -EACCES; + goto out_unlock_cgroup; } - get_task_struct(tsk); - rcu_read_unlock(); - } else { - if (threadgroup) - tsk = current->group_leader; - else - tsk = current; - get_task_struct(tsk); - } - - threadgroup_lock(tsk); + } else + tsk = current; if (threadgroup) + tsk = tsk->group_leader; + get_task_struct(tsk); + rcu_read_unlock(); + + threadgroup_lock(tsk); + if (threadgroup) { + if (!thread_group_leader(tsk)) { + /* + * a race with de_thread from another thread's exec() + * may strip us of our leadership, if this happens, + * there is no choice but to throw this task away and + * try again; this is + * "double-double-toil-and-trouble-check locking". + */ + threadgroup_unlock(tsk); + put_task_struct(tsk); + goto retry_find_task; + } ret = cgroup_attach_proc(cgrp, tsk); - else + } else ret = cgroup_attach_task(cgrp, tsk); - threadgroup_unlock(tsk); put_task_struct(tsk); +out_unlock_cgroup: cgroup_unlock(); return ret; } @@ -2305,16 +2293,7 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) { - int ret; - do { - /* - * attach_proc fails with -EAGAIN if threadgroup leadership - * changes in the middle of the operation, in which case we need - * to find the task_struct for the new leader and start over. - */ - ret = attach_task_by_pid(cgrp, tgid, true); - } while (ret == -EAGAIN); - return ret; + return attach_task_by_pid(cgrp, tgid, true); } /** -- cgit v1.2.3 From fb5d2b4cfc24963d0e8a7df57de1ecffa10a04cf Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Tue, 3 Jan 2012 21:18:31 -0800 Subject: cgroup: replace tasklist_lock with rcu_read_lock We can replace the tasklist_lock in cgroup_attach_proc with an rcu_read_lock(). Changes in V4: * https://lkml.org/lkml/2011/12/23/284 (Frederic Weisbecker) * Minimize size of rcu_read_lock critical section * Add comment * https://lkml.org/lkml/2011/12/26/136 (Li Zefan) * Split into two patches Changes in V3: * https://lkml.org/lkml/2011/12/22/419 (Frederic Weisbecker) * Add an rcu_read_lock to protect against exit Changes in V2: * https://lkml.org/lkml/2011/12/22/86 (Tejun Heo) * Use a goto instead of returning -EAGAIN Suggested-by: Frederic Weisbecker Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Acked-by: Frederic Weisbecker Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 12c07e8fd69c..1626152dcc1e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2102,10 +2102,14 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (retval) goto out_free_group_list; - /* prevent changes to the threadgroup list while we take a snapshot. */ - read_lock(&tasklist_lock); tsk = leader; i = 0; + /* + * Prevent freeing of tasks while we take a snapshot. Tasks that are + * already PF_EXITING could be freed from underneath us unless we + * take an rcu_read_lock. + */ + rcu_read_lock(); do { struct task_and_cgroup ent; @@ -2128,11 +2132,11 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) BUG_ON(retval != 0); i++; } while_each_thread(leader, tsk); + rcu_read_unlock(); /* remember the number of threads in the array for later. */ group_size = i; tset.tc_array = group; tset.tc_array_len = group_size; - read_unlock(&tasklist_lock); /* methods shouldn't be called if no task is actually migrating */ retval = 0; -- cgit v1.2.3 From 0ce8974d504913a0f0ae2d97b20a5ac665431a41 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 03:13:27 -0800 Subject: sysctl: Consolidate !CONFIG_SYSCTL handling - In sysctl.h move functions only available if CONFIG_SYSCL is defined inside of #ifdef CONFIG_SYSCTL - Move the stub function definitions for !CONFIG_SYSCTL into sysctl.h and make them static inlines. Signed-off-by: Eric W. Biederman --- include/linux/sysctl.h | 95 ++++++++++++++++++++++++++++++++------------------ kernel/sysctl.c | 26 -------------- 2 files changed, 62 insertions(+), 59 deletions(-) (limited to 'kernel') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index bb9127dd814b..cf3ee7f246d6 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -937,30 +937,8 @@ enum struct ctl_table; struct nsproxy; struct ctl_table_root; - -struct ctl_table_set { - struct list_head list; - struct ctl_table_set *parent; - int (*is_seen)(struct ctl_table_set *); -}; - -extern void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)); - struct ctl_table_header; -extern void sysctl_head_get(struct ctl_table_header *); -extern void sysctl_head_put(struct ctl_table_header *); -extern int sysctl_is_seen(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); -extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev); -extern void sysctl_head_finish(struct ctl_table_header *prev); -extern int sysctl_perm(struct ctl_table_root *root, - struct ctl_table *table, int op); - typedef struct ctl_table ctl_table; typedef int proc_handler (struct ctl_table *ctl, int write, @@ -1023,8 +1001,6 @@ static inline void *proc_sys_poll_event(struct ctl_table_poll *poll) return (void *)(unsigned long)atomic_read(&poll->event); } -void proc_sys_poll_notify(struct ctl_table_poll *poll); - #define __CTL_TABLE_POLL_INITIALIZER(name) { \ .event = ATOMIC_INIT(0), \ .wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.wait) } @@ -1047,15 +1023,6 @@ struct ctl_table void *extra2; }; -struct ctl_table_root { - struct list_head root_list; - struct ctl_table_set default_set; - struct ctl_table_set *(*lookup)(struct ctl_table_root *root, - struct nsproxy *namespaces); - int (*permissions)(struct ctl_table_root *root, - struct nsproxy *namespaces, struct ctl_table *table); -}; - /* struct ctl_table_header is used to maintain dynamic lists of struct ctl_table trees. */ struct ctl_table_header @@ -1078,11 +1045,45 @@ struct ctl_table_header struct ctl_table_header *parent; }; +struct ctl_table_set { + struct list_head list; + struct ctl_table_set *parent; + int (*is_seen)(struct ctl_table_set *); +}; + +struct ctl_table_root { + struct list_head root_list; + struct ctl_table_set default_set; + struct ctl_table_set *(*lookup)(struct ctl_table_root *root, + struct nsproxy *namespaces); + int (*permissions)(struct ctl_table_root *root, + struct nsproxy *namespaces, struct ctl_table *table); +}; + /* struct ctl_path describes where in the hierarchy a table is added */ struct ctl_path { const char *procname; }; +#ifdef CONFIG_SYSCTL + +void proc_sys_poll_notify(struct ctl_table_poll *poll); + +extern void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)); + +extern void sysctl_head_get(struct ctl_table_header *); +extern void sysctl_head_put(struct ctl_table_header *); +extern int sysctl_is_seen(struct ctl_table_header *); +extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *); +extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); +extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev); +extern void sysctl_head_finish(struct ctl_table_header *prev); +extern int sysctl_perm(struct ctl_table_root *root, + struct ctl_table *table, int op); + void register_sysctl_root(struct ctl_table_root *root); struct ctl_table_header *__register_sysctl_paths( struct ctl_table_root *root, struct nsproxy *namespaces, @@ -1094,6 +1095,34 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); +#else /* CONFIG_SYSCTL */ +static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) +{ + return NULL; +} + +static inline struct ctl_table_header *register_sysctl_paths( + const struct ctl_path *path, struct ctl_table *table) +{ + return NULL; +} + +static inline void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +static inline void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ +} + +static inline void sysctl_head_put(struct ctl_table_header *head) +{ +} + +#endif /* CONFIG_SYSCTL */ + #endif /* __KERNEL__ */ #endif /* _LINUX_SYSCTL_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..d5bbddd0de24 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2017,32 +2017,6 @@ void setup_sysctl_set(struct ctl_table_set *p, p->is_seen = is_seen; } -#else /* !CONFIG_SYSCTL */ -struct ctl_table_header *register_sysctl_table(struct ctl_table * table) -{ - return NULL; -} - -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return NULL; -} - -void unregister_sysctl_table(struct ctl_table_header * table) -{ -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ -} - -void sysctl_head_put(struct ctl_table_header *head) -{ -} - #endif /* CONFIG_SYSCTL */ /* -- cgit v1.2.3 From de4e83bd6b5e16d491ec068cd22801d5d063b07a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 03:34:20 -0800 Subject: sysctl: Register the base sysctl table like any other sysctl table. Simplify the code by treating the base sysctl table like any other sysctl table and register it with register_sysctl_table. To ensure this table is registered early enough to avoid problems call sysctl_init from proc_sys_init. Rename sysctl_net.c:sysctl_init() to net_sysctl_init() to avoid name conflicts now that kernel/sysctl.c:sysctl_init() is no longer static. Signed-off-by: Eric W. Biederman --- fs/proc/proc_sysctl.c | 3 ++- include/linux/sysctl.h | 1 + kernel/sysctl.c | 13 ++++--------- net/sysctl_net.c | 4 ++-- 4 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d82f4a8b4b80..9d29d28af577 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -468,5 +468,6 @@ int __init proc_sys_init(void) proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; proc_sys_root->nlink = 0; - return 0; + + return sysctl_init(); } diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index cf3ee7f246d6..5e3532e9599f 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -1095,6 +1095,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); +extern int sysctl_init(void); #else /* CONFIG_SYSCTL */ static inline struct ctl_table_header *register_sysctl_table(struct ctl_table * table) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d5bbddd0de24..ad460248acc7 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,7 +192,7 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[]; +static struct ctl_table root_table[1]; static struct ctl_table_root sysctl_table_root; static struct ctl_table_header root_table_header = { {{.count = 1, @@ -222,7 +222,7 @@ int sysctl_legacy_va_layout; /* The default sysctl tables: */ -static struct ctl_table root_table[] = { +static struct ctl_table sysctl_base_table[] = { { .procname = "kernel", .mode = 0555, @@ -1747,17 +1747,12 @@ static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) } } -static __init int sysctl_init(void) +int __init sysctl_init(void) { - sysctl_set_parent(NULL, root_table); -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - sysctl_check_table(current->nsproxy, root_table); -#endif + register_sysctl_table(sysctl_base_table); return 0; } -core_initcall(sysctl_init); - static struct ctl_table *is_branch_in(struct ctl_table *branch, struct ctl_table *table) { diff --git a/net/sysctl_net.c b/net/sysctl_net.c index e75813904f26..a6bbee2bc710 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -90,7 +90,7 @@ static struct pernet_operations sysctl_pernet_ops = { .exit = sysctl_net_exit, }; -static __init int sysctl_init(void) +static __init int net_sysctl_init(void) { int ret; ret = register_pernet_subsys(&sysctl_pernet_ops); @@ -102,7 +102,7 @@ static __init int sysctl_init(void) out: return ret; } -subsys_initcall(sysctl_init); +subsys_initcall(net_sysctl_init); struct ctl_table_header *register_net_sysctl_table(struct net *net, const struct ctl_path *path, struct ctl_table *table) -- cgit v1.2.3 From 1f87f0b52b1d6581168cb80f86746bc4df918d01 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 6 Jan 2012 04:07:15 -0800 Subject: sysctl: Move the implementation into fs/proc/proc_sysctl.c Move the core sysctl code from kernel/sysctl.c and kernel/sysctl_check.c into fs/proc/proc_sysctl.c. Currently sysctl maintenance is hampered by the sysctl implementation being split across 3 files with artificial layering between them. Consolidate the entire sysctl implementation into 1 file so that it is easier to see what is going on and hopefully allowing for simpler maintenance. For functions that are now only used in fs/proc/proc_sysctl.c remove their declarations from sysctl.h and make them static in fs/proc/proc_sysctl.c Signed-off-by: Eric W. Biederman --- fs/proc/internal.h | 3 + fs/proc/proc_sysctl.c | 622 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/sysctl.h | 16 -- kernel/Makefile | 1 - kernel/sysctl.c | 464 ------------------------------------ kernel/sysctl_check.c | 160 ------------- 6 files changed, 625 insertions(+), 641 deletions(-) delete mode 100644 kernel/sysctl_check.c (limited to 'kernel') diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 292577531ad1..3b5ecd960d6a 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -10,12 +10,15 @@ */ #include +struct ctl_table_header; extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); +extern void sysctl_head_put(struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } +static inline void sysctl_head_put(struct ctl_table_header *head) { } #endif #ifdef CONFIG_NET extern int proc_net_init(void); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 9d29d28af577..06e6f10ee8ec 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -24,6 +25,209 @@ void proc_sys_poll_notify(struct ctl_table_poll *poll) wake_up_interruptible(&poll->wait); } +static struct ctl_table root_table[1]; +static struct ctl_table_root sysctl_table_root; +static struct ctl_table_header root_table_header = { + {{.count = 1, + .ctl_table = root_table, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, +}; +static struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; +} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + list_del_init(&p->ctl_entry); +} + +static void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + kfree_rcu(head, rcu); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +static void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct list_head * +lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = lookup_header_set(root, namespaces); + return &set->list; +} + +static struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev) +{ + struct ctl_table_root *root; + struct list_head *header_list; + struct ctl_table_header *head; + struct list_head *tmp; + + spin_lock(&sysctl_lock); + if (prev) { + head = prev; + tmp = &prev->ctl_entry; + unuse_table(prev); + goto next; + } + tmp = &root_table_header.ctl_entry; + for (;;) { + head = list_entry(tmp, struct ctl_table_header, ctl_entry); + + if (!use_table(head)) + goto next; + spin_unlock(&sysctl_lock); + return head; + next: + root = head->root; + tmp = tmp->next; + header_list = lookup_header_list(root, namespaces); + if (tmp != header_list) + continue; + + do { + root = list_entry(root->root_list.next, + struct ctl_table_root, root_list); + if (root == &sysctl_table_root) + goto out; + header_list = lookup_header_list(root, namespaces); + } while (list_empty(header_list)); + tmp = header_list->next; + } +out: + spin_unlock(&sysctl_lock); + return NULL; +} + +static struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +{ + return __sysctl_head_next(current->nsproxy, prev); +} + +void register_sysctl_root(struct ctl_table_root *root) +{ + spin_lock(&sysctl_lock); + list_add_tail(&root->root_list, &sysctl_table_root.root_list); + spin_unlock(&sysctl_lock); +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. + */ + +static int test_perm(int mode, int op) +{ + if (!current_euid()) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +{ + int mode; + + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + +static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) +{ + for (; table->procname; table++) { + table->parent = parent; + if (table->child) + sysctl_set_parent(table, table->child); + } +} + + static struct inode *proc_sys_make_inode(struct super_block *sb, struct ctl_table_header *head, struct ctl_table *table) { @@ -435,6 +639,21 @@ static int proc_sys_delete(const struct dentry *dentry) return !!PROC_I(dentry->d_inode)->sysctl->unregistering; } +static int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + static int proc_sys_compare(const struct dentry *parent, const struct inode *pinode, const struct dentry *dentry, const struct inode *inode, @@ -460,6 +679,409 @@ static const struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; +static struct ctl_table *is_branch_in(struct ctl_table *branch, + struct ctl_table *table) +{ + struct ctl_table *p; + const char *s = branch->procname; + + /* branch should have named subdirectory as its first element */ + if (!s || !branch->child) + return NULL; + + /* ... and nothing else */ + if (branch[1].procname) + return NULL; + + /* table should contain subdirectory with the same name */ + for (p = table; p->procname; p++) { + if (!p->child) + continue; + if (p->procname && strcmp(p->procname, s) == 0) + return p; + } + return NULL; +} + +/* see if attaching q to p would be an improvement */ +static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +{ + struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + struct ctl_table *next; + int is_better = 0; + int not_in_parent = !p->attached_by; + + while ((next = is_branch_in(by, to)) != NULL) { + if (by == q->attached_by) + is_better = 1; + if (to == p->attached_by) + not_in_parent = 1; + by = by->child; + to = next->child; + } + + if (is_better && not_in_parent) { + q->attached_by = by; + q->attached_to = to; + q->parent = p; + } +} + +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK +static int sysctl_depth(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth; + + depth = 0; + for (tmp = table; tmp->parent; tmp = tmp->parent) + depth++; + + return depth; +} + +static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) +{ + int i; + + for (i = 0; table && i < n; i++) + table = table->parent; + + return table; +} + + +static void sysctl_print_path(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth, i; + depth = sysctl_depth(table); + if (table->procname) { + for (i = depth; i >= 0; i--) { + tmp = sysctl_parent(table, i); + printk("/%s", tmp->procname?tmp->procname:""); + } + } + printk(" "); +} + +static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, + struct ctl_table *table) +{ + struct ctl_table_header *head; + struct ctl_table *ref, *test; + int depth, cur_depth; + + depth = sysctl_depth(table); + + for (head = __sysctl_head_next(namespaces, NULL); head; + head = __sysctl_head_next(namespaces, head)) { + cur_depth = depth; + ref = head->ctl_table; +repeat: + test = sysctl_parent(table, cur_depth); + for (; ref->procname; ref++) { + int match = 0; + if (cur_depth && !ref->child) + continue; + + if (test->procname && ref->procname && + (strcmp(test->procname, ref->procname) == 0)) + match++; + + if (match) { + if (cur_depth != 0) { + cur_depth--; + ref = ref->child; + goto repeat; + } + goto out; + } + } + } + ref = NULL; +out: + sysctl_head_finish(head); + return ref; +} + +static void set_fail(const char **fail, struct ctl_table *table, const char *str) +{ + if (*fail) { + printk(KERN_ERR "sysctl table check failed: "); + sysctl_print_path(table); + printk(" %s\n", *fail); + dump_stack(); + } + *fail = str; +} + +static void sysctl_check_leaf(struct nsproxy *namespaces, + struct ctl_table *table, const char **fail) +{ + struct ctl_table *ref; + + ref = sysctl_check_lookup(namespaces, table); + if (ref && (ref != table)) + set_fail(fail, table, "Sysctl already exists"); +} + +static int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) +{ + int error = 0; + for (; table->procname; table++) { + const char *fail = NULL; + + if (table->parent) { + if (!table->parent->procname) + set_fail(&fail, table, "Parent without procname"); + } + if (table->child) { + if (table->data) + set_fail(&fail, table, "Directory with data?"); + if (table->maxlen) + set_fail(&fail, table, "Directory with maxlen?"); + if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) + set_fail(&fail, table, "Writable sysctl directory"); + if (table->proc_handler) + set_fail(&fail, table, "Directory with proc_handler"); + if (table->extra1) + set_fail(&fail, table, "Directory with extra1"); + if (table->extra2) + set_fail(&fail, table, "Directory with extra2"); + } else { + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + set_fail(&fail, table, "No data"); + if (!table->maxlen) + set_fail(&fail, table, "No maxlen"); + } +#ifdef CONFIG_PROC_SYSCTL + if (!table->proc_handler) + set_fail(&fail, table, "No proc_handler"); +#endif + sysctl_check_leaf(namespaces, table, &fail); + } + if (table->mode > 0777) + set_fail(&fail, table, "bogus .mode"); + if (fail) { + set_fail(&fail, table, NULL); + error = -EINVAL; + } + if (table->child) + error |= sysctl_check_table(namespaces, table->child); + } + return error; +} +#endif /* CONFIG_SYSCTL_SYSCALL_CHECK */ + +/** + * __register_sysctl_paths - register a sysctl hierarchy + * @root: List of sysctl headers to register on + * @namespaces: Data to compute which lists of sysctl entries are visible + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. + * + * proc_handler - the text handler routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_root *root, + struct nsproxy *namespaces, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table_header *header; + struct ctl_table *new, **prevp; + unsigned int n, npath; + struct ctl_table_set *set; + + /* Count the path components */ + for (npath = 0; path[npath].procname; ++npath) + ; + + /* + * For each path component, allocate a 2-element ctl_table array. + * The first array element will be filled with the sysctl entry + * for this, the second will be the sentinel (procname == 0). + * + * We allocate everything in one go so that we don't have to + * worry about freeing additional memory in unregister_sysctl_table. + */ + header = kzalloc(sizeof(struct ctl_table_header) + + (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); + if (!header) + return NULL; + + new = (struct ctl_table *) (header + 1); + + /* Now connect the dots */ + prevp = &header->ctl_table; + for (n = 0; n < npath; ++n, ++path) { + /* Copy the procname */ + new->procname = path->procname; + new->mode = 0555; + + *prevp = new; + prevp = &new->child; + + new += 2; + } + *prevp = table; + header->ctl_table_arg = table; + + INIT_LIST_HEAD(&header->ctl_entry); + header->used = 0; + header->unregistering = NULL; + header->root = root; + sysctl_set_parent(NULL, header->ctl_table); + header->count = 1; +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK + if (sysctl_check_table(namespaces, header->ctl_table)) { + kfree(header); + return NULL; + } +#endif + spin_lock(&sysctl_lock); + header->set = lookup_header_set(root, namespaces); + header->attached_by = header->ctl_table; + header->attached_to = root_table; + header->parent = &root_table_header; + for (set = header->set; set; set = set->parent) { + struct ctl_table_header *p; + list_for_each_entry(p, &set->list, ctl_entry) { + if (p->unregistering) + continue; + try_attach(p, header); + } + } + header->parent->count++; + list_add_tail(&header->ctl_entry, &header->set->list); + spin_unlock(&sysctl_lock); + + return header; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, + path, table); +} +EXPORT_SYMBOL(register_sysctl_paths); + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} +EXPORT_SYMBOL(register_sysctl_table); + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + might_sleep(); + + if (header == NULL) + return; + + spin_lock(&sysctl_lock); + start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + kfree_rcu(header->parent, rcu); + } + if (!--header->count) + kfree_rcu(header, rcu); + spin_unlock(&sysctl_lock); +} +EXPORT_SYMBOL(unregister_sysctl_table); + +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ + INIT_LIST_HEAD(&p->list); + p->parent = parent ? parent : &sysctl_table_root.default_set; + p->is_seen = is_seen; +} + + int __init proc_sys_init(void) { struct proc_dir_entry *proc_sys_root; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 5e3532e9599f..08cabbfddacb 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -1073,17 +1073,6 @@ extern void setup_sysctl_set(struct ctl_table_set *p, struct ctl_table_set *parent, int (*is_seen)(struct ctl_table_set *)); -extern void sysctl_head_get(struct ctl_table_header *); -extern void sysctl_head_put(struct ctl_table_header *); -extern int sysctl_is_seen(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *); -extern struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev); -extern struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev); -extern void sysctl_head_finish(struct ctl_table_header *prev); -extern int sysctl_perm(struct ctl_table_root *root, - struct ctl_table *table, int op); - void register_sysctl_root(struct ctl_table_root *root); struct ctl_table_header *__register_sysctl_paths( struct ctl_table_root *root, struct nsproxy *namespaces, @@ -1093,7 +1082,6 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, struct ctl_table *table); void unregister_sysctl_table(struct ctl_table_header * table); -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table); extern int sysctl_init(void); #else /* CONFIG_SYSCTL */ @@ -1118,10 +1106,6 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, { } -static inline void sysctl_head_put(struct ctl_table_header *head) -{ -} - #endif /* CONFIG_SYSCTL */ #endif /* __KERNEL__ */ diff --git a/kernel/Makefile b/kernel/Makefile index 2d9de86b7e76..cb41b9547c9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,7 +27,6 @@ obj-y += power/ obj-$(CONFIG_FREEZER) += freezer.o obj-$(CONFIG_PROFILING) += profile.o -obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ad460248acc7..b774909ed46c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -192,20 +192,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write, #endif -static struct ctl_table root_table[1]; -static struct ctl_table_root sysctl_table_root; -static struct ctl_table_header root_table_header = { - {{.count = 1, - .ctl_table = root_table, - .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, - .root = &sysctl_table_root, - .set = &sysctl_table_root.default_set, -}; -static struct ctl_table_root sysctl_table_root = { - .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), - .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), -}; - static struct ctl_table kern_table[]; static struct ctl_table vm_table[]; static struct ctl_table fs_table[]; @@ -1559,459 +1545,12 @@ static struct ctl_table dev_table[] = { { } }; -static DEFINE_SPINLOCK(sysctl_lock); - -/* called under sysctl_lock */ -static int use_table(struct ctl_table_header *p) -{ - if (unlikely(p->unregistering)) - return 0; - p->used++; - return 1; -} - -/* called under sysctl_lock */ -static void unuse_table(struct ctl_table_header *p) -{ - if (!--p->used) - if (unlikely(p->unregistering)) - complete(p->unregistering); -} - -/* called under sysctl_lock, will reacquire if has to wait */ -static void start_unregistering(struct ctl_table_header *p) -{ - /* - * if p->used is 0, nobody will ever touch that entry again; - * we'll eliminate all paths to it before dropping sysctl_lock - */ - if (unlikely(p->used)) { - struct completion wait; - init_completion(&wait); - p->unregistering = &wait; - spin_unlock(&sysctl_lock); - wait_for_completion(&wait); - spin_lock(&sysctl_lock); - } else { - /* anything non-NULL; we'll never dereference it */ - p->unregistering = ERR_PTR(-EINVAL); - } - /* - * do not remove from the list until nobody holds it; walking the - * list in do_sysctl() relies on that. - */ - list_del_init(&p->ctl_entry); -} - -void sysctl_head_get(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - head->count++; - spin_unlock(&sysctl_lock); -} - -void sysctl_head_put(struct ctl_table_header *head) -{ - spin_lock(&sysctl_lock); - if (!--head->count) - kfree_rcu(head, rcu); - spin_unlock(&sysctl_lock); -} - -struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) -{ - if (!head) - BUG(); - spin_lock(&sysctl_lock); - if (!use_table(head)) - head = ERR_PTR(-ENOENT); - spin_unlock(&sysctl_lock); - return head; -} - -void sysctl_head_finish(struct ctl_table_header *head) -{ - if (!head) - return; - spin_lock(&sysctl_lock); - unuse_table(head); - spin_unlock(&sysctl_lock); -} - -static struct ctl_table_set * -lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = &root->default_set; - if (root->lookup) - set = root->lookup(root, namespaces); - return set; -} - -static struct list_head * -lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) -{ - struct ctl_table_set *set = lookup_header_set(root, namespaces); - return &set->list; -} - -struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, - struct ctl_table_header *prev) -{ - struct ctl_table_root *root; - struct list_head *header_list; - struct ctl_table_header *head; - struct list_head *tmp; - - spin_lock(&sysctl_lock); - if (prev) { - head = prev; - tmp = &prev->ctl_entry; - unuse_table(prev); - goto next; - } - tmp = &root_table_header.ctl_entry; - for (;;) { - head = list_entry(tmp, struct ctl_table_header, ctl_entry); - - if (!use_table(head)) - goto next; - spin_unlock(&sysctl_lock); - return head; - next: - root = head->root; - tmp = tmp->next; - header_list = lookup_header_list(root, namespaces); - if (tmp != header_list) - continue; - - do { - root = list_entry(root->root_list.next, - struct ctl_table_root, root_list); - if (root == &sysctl_table_root) - goto out; - header_list = lookup_header_list(root, namespaces); - } while (list_empty(header_list)); - tmp = header_list->next; - } -out: - spin_unlock(&sysctl_lock); - return NULL; -} - -struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) -{ - return __sysctl_head_next(current->nsproxy, prev); -} - -void register_sysctl_root(struct ctl_table_root *root) -{ - spin_lock(&sysctl_lock); - list_add_tail(&root->root_list, &sysctl_table_root.root_list); - spin_unlock(&sysctl_lock); -} - -/* - * sysctl_perm does NOT grant the superuser all rights automatically, because - * some sysctl variables are readonly even to root. - */ - -static int test_perm(int mode, int op) -{ - if (!current_euid()) - mode >>= 6; - else if (in_egroup_p(0)) - mode >>= 3; - if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) - return 0; - return -EACCES; -} - -int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) -{ - int mode; - - if (root->permissions) - mode = root->permissions(root, current->nsproxy, table); - else - mode = table->mode; - - return test_perm(mode, op); -} - -static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) -{ - for (; table->procname; table++) { - table->parent = parent; - if (table->child) - sysctl_set_parent(table, table->child); - } -} - int __init sysctl_init(void) { register_sysctl_table(sysctl_base_table); return 0; } -static struct ctl_table *is_branch_in(struct ctl_table *branch, - struct ctl_table *table) -{ - struct ctl_table *p; - const char *s = branch->procname; - - /* branch should have named subdirectory as its first element */ - if (!s || !branch->child) - return NULL; - - /* ... and nothing else */ - if (branch[1].procname) - return NULL; - - /* table should contain subdirectory with the same name */ - for (p = table; p->procname; p++) { - if (!p->child) - continue; - if (p->procname && strcmp(p->procname, s) == 0) - return p; - } - return NULL; -} - -/* see if attaching q to p would be an improvement */ -static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) -{ - struct ctl_table *to = p->ctl_table, *by = q->ctl_table; - struct ctl_table *next; - int is_better = 0; - int not_in_parent = !p->attached_by; - - while ((next = is_branch_in(by, to)) != NULL) { - if (by == q->attached_by) - is_better = 1; - if (to == p->attached_by) - not_in_parent = 1; - by = by->child; - to = next->child; - } - - if (is_better && not_in_parent) { - q->attached_by = by; - q->attached_to = to; - q->parent = p; - } -} - -/** - * __register_sysctl_paths - register a sysctl hierarchy - * @root: List of sysctl headers to register on - * @namespaces: Data to compute which lists of sysctl entries are visible - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * The members of the &struct ctl_table structure are used as follows: - * - * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not - * enter a sysctl file - * - * data - a pointer to data for use by proc_handler - * - * maxlen - the maximum size in bytes of the data - * - * mode - the file permissions for the /proc/sys file, and for sysctl(2) - * - * child - a pointer to the child sysctl table if this entry is a directory, or - * %NULL. - * - * proc_handler - the text handler routine (described below) - * - * de - for internal use by the sysctl routines - * - * extra1, extra2 - extra pointers usable by the proc handler routines - * - * Leaf nodes in the sysctl tree will be represented by a single file - * under /proc; non-leaf nodes will be represented by directories. - * - * sysctl(2) can automatically manage read and write requests through - * the sysctl table. The data and maxlen fields of the ctl_table - * struct enable minimal validation of the values being written to be - * performed, and the mode field allows minimal authentication. - * - * There must be a proc_handler routine for any terminal nodes - * mirrored under /proc/sys (non-terminals are handled by a built-in - * directory handler). Several default handlers are available to - * cover common cases - - * - * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), - * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), - * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() - * - * It is the handler's job to read the input buffer from user memory - * and process it. The handler should return 0 on success. - * - * This routine returns %NULL on a failure to register, and a pointer - * to the table header on success. - */ -struct ctl_table_header *__register_sysctl_paths( - struct ctl_table_root *root, - struct nsproxy *namespaces, - const struct ctl_path *path, struct ctl_table *table) -{ - struct ctl_table_header *header; - struct ctl_table *new, **prevp; - unsigned int n, npath; - struct ctl_table_set *set; - - /* Count the path components */ - for (npath = 0; path[npath].procname; ++npath) - ; - - /* - * For each path component, allocate a 2-element ctl_table array. - * The first array element will be filled with the sysctl entry - * for this, the second will be the sentinel (procname == 0). - * - * We allocate everything in one go so that we don't have to - * worry about freeing additional memory in unregister_sysctl_table. - */ - header = kzalloc(sizeof(struct ctl_table_header) + - (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); - if (!header) - return NULL; - - new = (struct ctl_table *) (header + 1); - - /* Now connect the dots */ - prevp = &header->ctl_table; - for (n = 0; n < npath; ++n, ++path) { - /* Copy the procname */ - new->procname = path->procname; - new->mode = 0555; - - *prevp = new; - prevp = &new->child; - - new += 2; - } - *prevp = table; - header->ctl_table_arg = table; - - INIT_LIST_HEAD(&header->ctl_entry); - header->used = 0; - header->unregistering = NULL; - header->root = root; - sysctl_set_parent(NULL, header->ctl_table); - header->count = 1; -#ifdef CONFIG_SYSCTL_SYSCALL_CHECK - if (sysctl_check_table(namespaces, header->ctl_table)) { - kfree(header); - return NULL; - } -#endif - spin_lock(&sysctl_lock); - header->set = lookup_header_set(root, namespaces); - header->attached_by = header->ctl_table; - header->attached_to = root_table; - header->parent = &root_table_header; - for (set = header->set; set; set = set->parent) { - struct ctl_table_header *p; - list_for_each_entry(p, &set->list, ctl_entry) { - if (p->unregistering) - continue; - try_attach(p, header); - } - } - header->parent->count++; - list_add_tail(&header->ctl_entry, &header->set->list); - spin_unlock(&sysctl_lock); - - return header; -} - -/** - * register_sysctl_table_path - register a sysctl table hierarchy - * @path: The path to the directory the sysctl table is in. - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See __register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, - struct ctl_table *table) -{ - return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, - path, table); -} - -/** - * register_sysctl_table - register a sysctl table hierarchy - * @table: the top-level table structure - * - * Register a sysctl table hierarchy. @table should be a filled in ctl_table - * array. A completely 0 filled entry terminates the table. - * - * See register_sysctl_paths for more details. - */ -struct ctl_table_header *register_sysctl_table(struct ctl_table *table) -{ - static const struct ctl_path null_path[] = { {} }; - - return register_sysctl_paths(null_path, table); -} - -/** - * unregister_sysctl_table - unregister a sysctl table hierarchy - * @header: the header returned from register_sysctl_table - * - * Unregisters the sysctl table and all children. proc entries may not - * actually be removed until they are no longer used by anyone. - */ -void unregister_sysctl_table(struct ctl_table_header * header) -{ - might_sleep(); - - if (header == NULL) - return; - - spin_lock(&sysctl_lock); - start_unregistering(header); - if (!--header->parent->count) { - WARN_ON(1); - kfree_rcu(header->parent, rcu); - } - if (!--header->count) - kfree_rcu(header, rcu); - spin_unlock(&sysctl_lock); -} - -int sysctl_is_seen(struct ctl_table_header *p) -{ - struct ctl_table_set *set = p->set; - int res; - spin_lock(&sysctl_lock); - if (p->unregistering) - res = 0; - else if (!set->is_seen) - res = 1; - else - res = set->is_seen(set); - spin_unlock(&sysctl_lock); - return res; -} - -void setup_sysctl_set(struct ctl_table_set *p, - struct ctl_table_set *parent, - int (*is_seen)(struct ctl_table_set *)) -{ - INIT_LIST_HEAD(&p->list); - p->parent = parent ? parent : &sysctl_table_root.default_set; - p->is_seen = is_seen; -} - #endif /* CONFIG_SYSCTL */ /* @@ -2977,6 +2516,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); -EXPORT_SYMBOL(register_sysctl_table); -EXPORT_SYMBOL(register_sysctl_paths); -EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c deleted file mode 100644 index 362da653813d..000000000000 --- a/kernel/sysctl_check.c +++ /dev/null @@ -1,160 +0,0 @@ -#include -#include -#include "../fs/xfs/xfs_sysctl.h" -#include -#include -#include - - -static int sysctl_depth(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth; - - depth = 0; - for (tmp = table; tmp->parent; tmp = tmp->parent) - depth++; - - return depth; -} - -static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) -{ - int i; - - for (i = 0; table && i < n; i++) - table = table->parent; - - return table; -} - - -static void sysctl_print_path(struct ctl_table *table) -{ - struct ctl_table *tmp; - int depth, i; - depth = sysctl_depth(table); - if (table->procname) { - for (i = depth; i >= 0; i--) { - tmp = sysctl_parent(table, i); - printk("/%s", tmp->procname?tmp->procname:""); - } - } - printk(" "); -} - -static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, - struct ctl_table *table) -{ - struct ctl_table_header *head; - struct ctl_table *ref, *test; - int depth, cur_depth; - - depth = sysctl_depth(table); - - for (head = __sysctl_head_next(namespaces, NULL); head; - head = __sysctl_head_next(namespaces, head)) { - cur_depth = depth; - ref = head->ctl_table; -repeat: - test = sysctl_parent(table, cur_depth); - for (; ref->procname; ref++) { - int match = 0; - if (cur_depth && !ref->child) - continue; - - if (test->procname && ref->procname && - (strcmp(test->procname, ref->procname) == 0)) - match++; - - if (match) { - if (cur_depth != 0) { - cur_depth--; - ref = ref->child; - goto repeat; - } - goto out; - } - } - } - ref = NULL; -out: - sysctl_head_finish(head); - return ref; -} - -static void set_fail(const char **fail, struct ctl_table *table, const char *str) -{ - if (*fail) { - printk(KERN_ERR "sysctl table check failed: "); - sysctl_print_path(table); - printk(" %s\n", *fail); - dump_stack(); - } - *fail = str; -} - -static void sysctl_check_leaf(struct nsproxy *namespaces, - struct ctl_table *table, const char **fail) -{ - struct ctl_table *ref; - - ref = sysctl_check_lookup(namespaces, table); - if (ref && (ref != table)) - set_fail(fail, table, "Sysctl already exists"); -} - -int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) -{ - int error = 0; - for (; table->procname; table++) { - const char *fail = NULL; - - if (table->parent) { - if (!table->parent->procname) - set_fail(&fail, table, "Parent without procname"); - } - if (table->child) { - if (table->data) - set_fail(&fail, table, "Directory with data?"); - if (table->maxlen) - set_fail(&fail, table, "Directory with maxlen?"); - if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) - set_fail(&fail, table, "Writable sysctl directory"); - if (table->proc_handler) - set_fail(&fail, table, "Directory with proc_handler"); - if (table->extra1) - set_fail(&fail, table, "Directory with extra1"); - if (table->extra2) - set_fail(&fail, table, "Directory with extra2"); - } else { - if ((table->proc_handler == proc_dostring) || - (table->proc_handler == proc_dointvec) || - (table->proc_handler == proc_dointvec_minmax) || - (table->proc_handler == proc_dointvec_jiffies) || - (table->proc_handler == proc_dointvec_userhz_jiffies) || - (table->proc_handler == proc_dointvec_ms_jiffies) || - (table->proc_handler == proc_doulongvec_minmax) || - (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { - if (!table->data) - set_fail(&fail, table, "No data"); - if (!table->maxlen) - set_fail(&fail, table, "No maxlen"); - } -#ifdef CONFIG_PROC_SYSCTL - if (!table->proc_handler) - set_fail(&fail, table, "No proc_handler"); -#endif - sysctl_check_leaf(namespaces, table, &fail); - } - if (table->mode > 0777) - set_fail(&fail, table, "bogus .mode"); - if (fail) { - set_fail(&fail, table, NULL); - error = -EINVAL; - } - if (table->child) - error |= sysctl_check_table(namespaces, table->child); - } - return error; -} -- cgit v1.2.3 From 2ed86b16eabe4efbf80cc725a8cbb5310746a2fc Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 25 Jan 2012 20:02:40 -0600 Subject: irq: make SPARSE_IRQ an optionally hidden option On ARM, we don't want SPARSE_IRQ to be a user visible option. Make SPARSE_IRQ visible based on MAY_HAVE_SPARSE_IRQ instead of depending on HAVE_SPARSE_IRQ. With this, SPARSE_IRQ is not visible on C6X and ARM. Signed-off-by: Rob Herring Cc: Russell King Cc: Mark Salter Cc: Aurelien Jacquiot Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Mundt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linux-c6x-dev@linux-c6x.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-sh@vger.kernel.org --- arch/arm/Kconfig | 1 - arch/c6x/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/sh/Kconfig | 2 +- arch/x86/Kconfig | 1 - kernel/irq/Kconfig | 5 ++--- 6 files changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 24626b0419ee..30e7840498ce 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -28,7 +28,6 @@ config ARM select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)) select HAVE_C_RECORDMCOUNT select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ select GENERIC_IRQ_SHOW select CPU_PM if (SUSPEND || CPU_IDLE) select GENERIC_PCI_IOMAP diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 26e67f0f0051..2f58c61e2812 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -11,7 +11,7 @@ config TMS320C6X select HAVE_DMA_API_DEBUG select HAVE_GENERIC_HARDIRQS select HAVE_MEMBLOCK - select HAVE_SPARSE_IRQ + select SPARSE_IRQ select OF select OF_EARLY_FLATTREE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1919634a9b32..06c1cf0f24a6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -133,7 +133,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ + select MAY_HAVE_SPARSE_IRQ select IRQ_PER_CPU select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 3c8db65c89e5..21b82a8cca21 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -22,7 +22,7 @@ config SUPERH select HAVE_SYSCALL_TRACEPOINTS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ + select MAY_HAVE_SPARSE_IRQ select IRQ_FORCED_THREADING select RTC_LIB select GENERIC_ATOMIC64 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 864cc6e6ac8e..fb2da445945f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,6 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS - select HAVE_SPARSE_IRQ select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT select GENERIC_IRQ_PROBE diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5a38bf4de641..1f2dece9ad4c 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -13,7 +13,7 @@ config GENERIC_HARDIRQS # Options selectable by the architecture code # Make sparse irq Kconfig switch below available -config HAVE_SPARSE_IRQ +config MAY_HAVE_SPARSE_IRQ bool # Enable the generic irq autoprobe mechanism @@ -61,8 +61,7 @@ config IRQ_FORCED_THREADING bool config SPARSE_IRQ - bool "Support sparse irq numbering" - depends on HAVE_SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ ---help--- Sparse irq numbering is useful for distro kernels that want -- cgit v1.2.3 From 00c5fb774e3fa8c9d082c62eac7e3d178c006f56 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 11:23:15 -0800 Subject: time: Move total_sleep_time into the timekeeper structure Move total_sleep_time into the timekeeper structure in preparation for locking cleanups CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0c6358186401..8427cc20bad6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -47,6 +47,10 @@ struct timekeeper { int ntp_error_shift; /* NTP adjusted clock multiplier */ u32 mult; + + /* time spent in suspend */ + struct timespec total_sleep_time; + }; static struct timekeeper timekeeper; @@ -159,7 +163,6 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); */ static struct timespec xtime __attribute__ ((aligned (16))); static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); -static struct timespec total_sleep_time; /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -591,8 +594,8 @@ void __init timekeeping_init(void) } set_normalized_timespec(&wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); - total_sleep_time.tv_sec = 0; - total_sleep_time.tv_nsec = 0; + timekeeper.total_sleep_time.tv_sec = 0; + timekeeper.total_sleep_time.tv_nsec = 0; write_sequnlock_irqrestore(&xtime_lock, flags); } @@ -616,7 +619,8 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) xtime = timespec_add(xtime, *delta); wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); - total_sleep_time = timespec_add(total_sleep_time, *delta); + timekeeper.total_sleep_time = timespec_add( + timekeeper.total_sleep_time, *delta); } @@ -1074,8 +1078,10 @@ static void update_wall_time(void) void getboottime(struct timespec *ts) { struct timespec boottime = { - .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, - .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + .tv_sec = wall_to_monotonic.tv_sec + + timekeeper.total_sleep_time.tv_sec, + .tv_nsec = wall_to_monotonic.tv_nsec + + timekeeper.total_sleep_time.tv_nsec }; set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); @@ -1104,7 +1110,7 @@ void get_monotonic_boottime(struct timespec *ts) seq = read_seqbegin(&xtime_lock); *ts = xtime; tomono = wall_to_monotonic; - sleep = total_sleep_time; + sleep = timekeeper.total_sleep_time; nsecs = timekeeping_get_ns(); } while (read_seqretry(&xtime_lock, seq)); @@ -1137,7 +1143,7 @@ EXPORT_SYMBOL_GPL(ktime_get_boottime); */ void monotonic_to_bootbased(struct timespec *ts) { - *ts = timespec_add(*ts, total_sleep_time); + *ts = timespec_add(*ts, timekeeper.total_sleep_time); } EXPORT_SYMBOL_GPL(monotonic_to_bootbased); @@ -1212,7 +1218,7 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, seq = read_seqbegin(&xtime_lock); *xtim = xtime; *wtom = wall_to_monotonic; - *sleep = total_sleep_time; + *sleep = timekeeper.total_sleep_time; } while (read_seqretry(&xtime_lock, seq)); } -- cgit v1.2.3 From d9f7217aac6833cc634741f2f771a87fd1518fee Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 11:29:32 -0800 Subject: time: Move wall_to_monotonic into the timekeeper structure In preparation for locking cleanups, move wall_to_monotonic into the timekeeper structure. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 69 +++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8427cc20bad6..5655ca3a86d7 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -48,6 +48,21 @@ struct timekeeper { /* NTP adjusted clock multiplier */ u32 mult; + /* + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub jiffie times) to get to monotonic time. Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + * + * wall_to_monotonic is moved after resume from suspend for the + * monotonic time not to jump. We need to add total_sleep_time to + * wall_to_monotonic to get the real boot based time offset. + * + * - wall_to_monotonic is no longer the boot time, getboottime must be + * used instead. + */ + struct timespec wall_to_monotonic; /* time spent in suspend */ struct timespec total_sleep_time; @@ -148,21 +163,8 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); /* * The current time - * wall_to_monotonic is what we need to add to xtime (or xtime corrected - * for sub jiffie times) to get to monotonic time. Monotonic is pegged - * at zero at system boot time, so wall_to_monotonic will be negative, - * however, we will ALWAYS keep the tv_nsec part positive so we can use - * the usual normalization. - * - * wall_to_monotonic is moved after resume from suspend for the monotonic - * time not to jump. We need to add total_sleep_time to wall_to_monotonic - * to get the real boot based time offset. - * - * - wall_to_monotonic is no longer the boot time, getboottime must be - * used instead. */ static struct timespec xtime __attribute__ ((aligned (16))); -static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -176,8 +178,8 @@ int __read_mostly timekeeping_suspended; void timekeeping_leap_insert(int leapsecond) { xtime.tv_sec += leapsecond; - wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -249,8 +251,8 @@ ktime_t ktime_get(void) do { seq = read_seqbegin(&xtime_lock); - secs = xtime.tv_sec + wall_to_monotonic.tv_sec; - nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec; + secs = xtime.tv_sec + timekeeper.wall_to_monotonic.tv_sec; + nsecs = xtime.tv_nsec + timekeeper.wall_to_monotonic.tv_nsec; nsecs += timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -283,7 +285,7 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&xtime_lock); *ts = xtime; - tomono = wall_to_monotonic; + tomono = timekeeper.wall_to_monotonic; nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -374,14 +376,15 @@ int do_settimeofday(const struct timespec *tv) ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; - wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta); + timekeeper.wall_to_monotonic = + timespec_sub(timekeeper.wall_to_monotonic, ts_delta); xtime = *tv; timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -413,12 +416,13 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(); xtime = timespec_add(xtime, *ts); - wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts); + timekeeper.wall_to_monotonic = + timespec_sub(timekeeper.wall_to_monotonic, *ts); timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -592,7 +596,7 @@ void __init timekeeping_init(void) boot.tv_sec = xtime.tv_sec; boot.tv_nsec = xtime.tv_nsec; } - set_normalized_timespec(&wall_to_monotonic, + set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; @@ -618,7 +622,8 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) } xtime = timespec_add(xtime, *delta); - wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta); + timekeeper.wall_to_monotonic = + timespec_sub(timekeeper.wall_to_monotonic, *delta); timekeeper.total_sleep_time = timespec_add( timekeeper.total_sleep_time, *delta); } @@ -651,7 +656,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -1060,7 +1065,7 @@ static void update_wall_time(void) } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); } @@ -1078,9 +1083,9 @@ static void update_wall_time(void) void getboottime(struct timespec *ts) { struct timespec boottime = { - .tv_sec = wall_to_monotonic.tv_sec + + .tv_sec = timekeeper.wall_to_monotonic.tv_sec + timekeeper.total_sleep_time.tv_sec, - .tv_nsec = wall_to_monotonic.tv_nsec + + .tv_nsec = timekeeper.wall_to_monotonic.tv_nsec + timekeeper.total_sleep_time.tv_nsec }; @@ -1109,7 +1114,7 @@ void get_monotonic_boottime(struct timespec *ts) do { seq = read_seqbegin(&xtime_lock); *ts = xtime; - tomono = wall_to_monotonic; + tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; nsecs = timekeeping_get_ns(); @@ -1182,7 +1187,7 @@ struct timespec get_monotonic_coarse(void) seq = read_seqbegin(&xtime_lock); now = xtime; - mono = wall_to_monotonic; + mono = timekeeper.wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, @@ -1217,7 +1222,7 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, do { seq = read_seqbegin(&xtime_lock); *xtim = xtime; - *wtom = wall_to_monotonic; + *wtom = timekeeper.wall_to_monotonic; *sleep = timekeeper.total_sleep_time; } while (read_seqretry(&xtime_lock, seq)); } @@ -1232,7 +1237,7 @@ ktime_t ktime_get_monotonic_offset(void) do { seq = read_seqbegin(&xtime_lock); - wtom = wall_to_monotonic; + wtom = timekeeper.wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); return timespec_to_ktime(wtom); } -- cgit v1.2.3 From 8ff2cb92dd1afcf23e7b5287c43a900b16f40bad Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 11:40:54 -0800 Subject: time: Move xtime into timekeeeper structure In preparation for locking cleanups, move xtime into timekeeper structure. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 91 ++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5655ca3a86d7..b30ffe6c6b06 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -48,6 +48,8 @@ struct timekeeper { /* NTP adjusted clock multiplier */ u32 mult; + /* The current time */ + struct timespec xtime; /* * wall_to_monotonic is what we need to add to xtime (or xtime corrected * for sub jiffie times) to get to monotonic time. Monotonic is pegged @@ -161,10 +163,6 @@ static inline s64 timekeeping_get_ns_raw(void) __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); -/* - * The current time - */ -static struct timespec xtime __attribute__ ((aligned (16))); /* * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. @@ -177,10 +175,10 @@ int __read_mostly timekeeping_suspended; /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { - xtime.tv_sec += leapsecond; + timekeeper.xtime.tv_sec += leapsecond; timekeeper.wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + timekeeper.clock, timekeeper.mult); } /** @@ -207,7 +205,7 @@ static void timekeeping_forward_now(void) /* If arch requires, add in gettimeoffset() */ nsec += arch_gettimeoffset(); - timespec_add_ns(&xtime, nsec); + timespec_add_ns(&timekeeper.xtime, nsec); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); timespec_add_ns(&raw_time, nsec); @@ -229,7 +227,7 @@ void getnstimeofday(struct timespec *ts) do { seq = read_seqbegin(&xtime_lock); - *ts = xtime; + *ts = timekeeper.xtime; nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ @@ -251,8 +249,10 @@ ktime_t ktime_get(void) do { seq = read_seqbegin(&xtime_lock); - secs = xtime.tv_sec + timekeeper.wall_to_monotonic.tv_sec; - nsecs = xtime.tv_nsec + timekeeper.wall_to_monotonic.tv_nsec; + secs = timekeeper.xtime.tv_sec + + timekeeper.wall_to_monotonic.tv_sec; + nsecs = timekeeper.xtime.tv_nsec + + timekeeper.wall_to_monotonic.tv_nsec; nsecs += timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); @@ -284,7 +284,7 @@ void ktime_get_ts(struct timespec *ts) do { seq = read_seqbegin(&xtime_lock); - *ts = xtime; + *ts = timekeeper.xtime; tomono = timekeeper.wall_to_monotonic; nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ @@ -321,7 +321,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) seq = read_seqbegin(&xtime_lock); *ts_raw = raw_time; - *ts_real = xtime; + *ts_real = timekeeper.xtime; nsecs_raw = timekeeping_get_ns_raw(); nsecs_real = timekeeping_get_ns(); @@ -374,18 +374,18 @@ int do_settimeofday(const struct timespec *tv) timekeeping_forward_now(); - ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec; - ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec; + ts_delta.tv_sec = tv->tv_sec - timekeeper.xtime.tv_sec; + ts_delta.tv_nsec = tv->tv_nsec - timekeeper.xtime.tv_nsec; timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, ts_delta); - xtime = *tv; + timekeeper.xtime = *tv; timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -415,15 +415,15 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeping_forward_now(); - xtime = timespec_add(xtime, *ts); + timekeeper.xtime = timespec_add(timekeeper.xtime, *ts); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *ts); timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -588,13 +588,13 @@ void __init timekeeping_init(void) clock->enable(clock); timekeeper_setup_internals(clock); - xtime.tv_sec = now.tv_sec; - xtime.tv_nsec = now.tv_nsec; + timekeeper.xtime.tv_sec = now.tv_sec; + timekeeper.xtime.tv_nsec = now.tv_nsec; raw_time.tv_sec = 0; raw_time.tv_nsec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) { - boot.tv_sec = xtime.tv_sec; - boot.tv_nsec = xtime.tv_nsec; + boot.tv_sec = timekeeper.xtime.tv_sec; + boot.tv_nsec = timekeeper.xtime.tv_nsec; } set_normalized_timespec(&timekeeper.wall_to_monotonic, -boot.tv_sec, -boot.tv_nsec); @@ -621,7 +621,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) return; } - xtime = timespec_add(xtime, *delta); + timekeeper.xtime = timespec_add(timekeeper.xtime, *delta); timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *delta); timekeeper.total_sleep_time = timespec_add( @@ -656,8 +656,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeper.ntp_error = 0; ntp_clear(); - update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + timekeeper.clock, timekeeper.mult); write_sequnlock_irqrestore(&xtime_lock, flags); @@ -720,7 +720,7 @@ static int timekeeping_suspend(void) * try to compensate so the difference in system time * and persistent_clock time stays close to constant. */ - delta = timespec_sub(xtime, timekeeping_suspend_time); + delta = timespec_sub(timekeeper.xtime, timekeeping_suspend_time); delta_delta = timespec_sub(delta, old_delta); if (abs(delta_delta.tv_sec) >= 2) { /* @@ -952,7 +952,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; while (timekeeper.xtime_nsec >= nsecps) { timekeeper.xtime_nsec -= nsecps; - xtime.tv_sec++; + timekeeper.xtime.tv_sec++; second_overflow(); } @@ -998,7 +998,8 @@ static void update_wall_time(void) #else offset = (clock->read(clock) - clock->cycle_last) & clock->mask; #endif - timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; + timekeeper.xtime_nsec = (s64)timekeeper.xtime.tv_nsec << + timekeeper.shift; /* * With NO_HZ we may have to accumulate many cycle_intervals @@ -1049,8 +1050,10 @@ static void update_wall_time(void) * Store full nanoseconds into xtime after rounding it up and * add the remainder to the error difference. */ - xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; - timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; + timekeeper.xtime.tv_nsec = ((s64)timekeeper.xtime_nsec >> + timekeeper.shift) + 1; + timekeeper.xtime_nsec -= (s64)timekeeper.xtime.tv_nsec << + timekeeper.shift; timekeeper.ntp_error += timekeeper.xtime_nsec << timekeeper.ntp_error_shift; @@ -1058,15 +1061,15 @@ static void update_wall_time(void) * Finally, make sure that after the rounding * xtime.tv_nsec isn't larger then NSEC_PER_SEC */ - if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { - xtime.tv_nsec -= NSEC_PER_SEC; - xtime.tv_sec++; + if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { + timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; + timekeeper.xtime.tv_sec++; second_overflow(); } /* check to see if there is a new clocksource to use */ - update_vsyscall(&xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, - timekeeper.mult); + update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + timekeeper.clock, timekeeper.mult); } /** @@ -1113,7 +1116,7 @@ void get_monotonic_boottime(struct timespec *ts) do { seq = read_seqbegin(&xtime_lock); - *ts = xtime; + *ts = timekeeper.xtime; tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; nsecs = timekeeping_get_ns(); @@ -1154,13 +1157,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { - return xtime.tv_sec; + return timekeeper.xtime.tv_sec; } EXPORT_SYMBOL(get_seconds); struct timespec __current_kernel_time(void) { - return xtime; + return timekeeper.xtime; } struct timespec current_kernel_time(void) @@ -1171,7 +1174,7 @@ struct timespec current_kernel_time(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + now = timekeeper.xtime; } while (read_seqretry(&xtime_lock, seq)); return now; @@ -1186,7 +1189,7 @@ struct timespec get_monotonic_coarse(void) do { seq = read_seqbegin(&xtime_lock); - now = xtime; + now = timekeeper.xtime; mono = timekeeper.wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); @@ -1221,7 +1224,7 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, do { seq = read_seqbegin(&xtime_lock); - *xtim = xtime; + *xtim = timekeeper.xtime; *wtom = timekeeper.wall_to_monotonic; *sleep = timekeeper.total_sleep_time; } while (read_seqretry(&xtime_lock, seq)); -- cgit v1.2.3 From 01f71b47e08f2a062c4e77c94dfa9a7e0ae65fcb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 11:43:49 -0800 Subject: time: Move raw_time into timekeeper structure In preparation for locking cleanups, move raw_time into timekeeper structure. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b30ffe6c6b06..fbbc3c7ce7df 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -67,7 +67,8 @@ struct timekeeper { struct timespec wall_to_monotonic; /* time spent in suspend */ struct timespec total_sleep_time; - + /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ + struct timespec raw_time; }; static struct timekeeper timekeeper; @@ -164,10 +165,6 @@ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); -/* - * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. - */ -static struct timespec raw_time; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -208,7 +205,7 @@ static void timekeeping_forward_now(void) timespec_add_ns(&timekeeper.xtime, nsec); nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); - timespec_add_ns(&raw_time, nsec); + timespec_add_ns(&timekeeper.raw_time, nsec); } /** @@ -320,7 +317,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) seq = read_seqbegin(&xtime_lock); - *ts_raw = raw_time; + *ts_raw = timekeeper.raw_time; *ts_real = timekeeper.xtime; nsecs_raw = timekeeping_get_ns_raw(); @@ -499,7 +496,7 @@ void getrawmonotonic(struct timespec *ts) do { seq = read_seqbegin(&xtime_lock); nsecs = timekeeping_get_ns_raw(); - *ts = raw_time; + *ts = timekeeper.raw_time; } while (read_seqretry(&xtime_lock, seq)); @@ -590,8 +587,8 @@ void __init timekeeping_init(void) timekeeper.xtime.tv_sec = now.tv_sec; timekeeper.xtime.tv_nsec = now.tv_nsec; - raw_time.tv_sec = 0; - raw_time.tv_nsec = 0; + timekeeper.raw_time.tv_sec = 0; + timekeeper.raw_time.tv_nsec = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) { boot.tv_sec = timekeeper.xtime.tv_sec; boot.tv_nsec = timekeeper.xtime.tv_nsec; @@ -958,13 +955,13 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) /* Accumulate raw time */ raw_nsecs = timekeeper.raw_interval << shift; - raw_nsecs += raw_time.tv_nsec; + raw_nsecs += timekeeper.raw_time.tv_nsec; if (raw_nsecs >= NSEC_PER_SEC) { u64 raw_secs = raw_nsecs; raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); - raw_time.tv_sec += raw_secs; + timekeeper.raw_time.tv_sec += raw_secs; } - raw_time.tv_nsec = raw_nsecs; + timekeeper.raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ timekeeper.ntp_error += tick_length << shift; -- cgit v1.2.3 From 8fcce546be16130865550136831f71097d7fc228 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 11:46:39 -0800 Subject: time: Cleanup global variables and move them to the top Move global xtime_lock and timekeeping_suspended values up to the top of timekeeping.c CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fbbc3c7ce7df..5df2e7e556ca 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -73,6 +73,18 @@ struct timekeeper { static struct timekeeper timekeeper; +/* + * This read-write spinlock protects us from races in SMP while + * playing with xtime. + */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + + +/* flag for if timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + + + /** * timekeeper_setup_internals - Set up internals to use clocksource clock. * @@ -157,18 +169,6 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } -/* - * This read-write spinlock protects us from races in SMP while - * playing with xtime. - */ -__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); - - - - -/* flag for if timekeeping is suspended */ -int __read_mostly timekeeping_suspended; - /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { -- cgit v1.2.3 From 70471f2f061d59375e959b4e7d47ee62121babb1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 12:48:10 -0800 Subject: time: Add timekeeper lock Now that all the timekeeping variables are stored in the timekeeper structure, add a new lock to protect the structure. For now, this lock nests under the xtime_lock for writes. For readers, we don't need to take xtime_lock anymore. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 126 ++++++++++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5df2e7e556ca..f5d4d226defb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -69,6 +69,9 @@ struct timekeeper { struct timespec total_sleep_time; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. */ struct timespec raw_time; + + /* Seqlock for all timekeeper values */ + seqlock_t lock; }; static struct timekeeper timekeeper; @@ -172,10 +175,17 @@ static inline s64 timekeeping_get_ns_raw(void) /* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { + unsigned long flags; + + write_seqlock_irqsave(&timekeeper.lock, flags); + timekeeper.xtime.tv_sec += leapsecond; timekeeper.wall_to_monotonic.tv_sec -= leapsecond; update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); + + write_sequnlock_irqrestore(&timekeeper.lock, flags); + } /** @@ -222,7 +232,7 @@ void getnstimeofday(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); *ts = timekeeper.xtime; nsecs = timekeeping_get_ns(); @@ -230,7 +240,7 @@ void getnstimeofday(struct timespec *ts) /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts, nsecs); } @@ -245,7 +255,7 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); secs = timekeeper.xtime.tv_sec + timekeeper.wall_to_monotonic.tv_sec; nsecs = timekeeper.xtime.tv_nsec + @@ -254,7 +264,7 @@ ktime_t ktime_get(void) /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. @@ -280,14 +290,14 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); *ts = timekeeper.xtime; tomono = timekeeper.wall_to_monotonic; nsecs = timekeeping_get_ns(); /* If arch requires, add in gettimeoffset() */ nsecs += arch_gettimeoffset(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec, ts->tv_nsec + tomono.tv_nsec + nsecs); @@ -315,7 +325,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) do { u32 arch_offset; - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); *ts_raw = timekeeper.raw_time; *ts_real = timekeeper.xtime; @@ -328,7 +338,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw += arch_offset; nsecs_real += arch_offset; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -362,12 +372,13 @@ EXPORT_SYMBOL(do_gettimeofday); int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; - unsigned long flags; + unsigned long flags1,flags2; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&xtime_lock, flags1); + write_seqlock_irqsave(&timekeeper.lock, flags2); timekeeping_forward_now(); @@ -384,7 +395,8 @@ int do_settimeofday(const struct timespec *tv) update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags2); + write_sequnlock_irqrestore(&xtime_lock, flags1); /* signal hrtimers about time change */ clock_was_set(); @@ -403,12 +415,13 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { - unsigned long flags; + unsigned long flags1,flags2; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&xtime_lock, flags1); + write_seqlock_irqsave(&timekeeper.lock, flags2); timekeeping_forward_now(); @@ -422,7 +435,8 @@ int timekeeping_inject_offset(struct timespec *ts) update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags2); + write_sequnlock_irqrestore(&xtime_lock, flags1); /* signal hrtimers about time change */ clock_was_set(); @@ -494,11 +508,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); nsecs = timekeeping_get_ns_raw(); *ts = timekeeper.raw_time; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); timespec_add_ns(ts, nsecs); } @@ -514,24 +528,30 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); return ret; } /** * timekeeping_max_deferment - Returns max time the clocksource can be deferred - * - * Caller must observe xtime_lock via read_seqbegin/read_seqretry to - * ensure that the clocksource does not change! */ u64 timekeeping_max_deferment(void) { - return timekeeper.clock->max_idle_ns; + unsigned long seq; + u64 ret; + do { + seq = read_seqbegin(&timekeeper.lock); + + ret = timekeeper.clock->max_idle_ns; + + } while (read_seqretry(&timekeeper.lock, seq)); + + return ret; } /** @@ -576,10 +596,13 @@ void __init timekeeping_init(void) read_persistent_clock(&now); read_boot_clock(&boot); - write_seqlock_irqsave(&xtime_lock, flags); + seqlock_init(&timekeeper.lock); + write_seqlock_irqsave(&xtime_lock, flags); ntp_init(); + write_sequnlock_irqrestore(&xtime_lock, flags); + write_seqlock_irqsave(&timekeeper.lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -597,7 +620,7 @@ void __init timekeeping_init(void) -boot.tv_sec, -boot.tv_nsec); timekeeper.total_sleep_time.tv_sec = 0; timekeeper.total_sleep_time.tv_nsec = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags); } /* time in seconds when suspend began */ @@ -638,7 +661,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) */ void timekeeping_inject_sleeptime(struct timespec *delta) { - unsigned long flags; + unsigned long flags1,flags2; struct timespec ts; /* Make sure we don't set the clock twice */ @@ -646,7 +669,9 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&xtime_lock, flags1); + write_seqlock_irqsave(&timekeeper.lock, flags2); + timekeeping_forward_now(); __timekeeping_inject_sleeptime(delta); @@ -656,7 +681,8 @@ void timekeeping_inject_sleeptime(struct timespec *delta) update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags2); + write_sequnlock_irqrestore(&xtime_lock, flags1); /* signal hrtimers about time change */ clock_was_set(); @@ -672,14 +698,15 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { - unsigned long flags; + unsigned long flags1,flags2; struct timespec ts; read_persistent_clock(&ts); clocksource_resume(); - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&xtime_lock, flags1); + write_seqlock_irqsave(&timekeeper.lock, flags2); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -689,7 +716,8 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags2); + write_sequnlock_irqrestore(&xtime_lock, flags1); touch_softlockup_watchdog(); @@ -701,13 +729,14 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { - unsigned long flags; + unsigned long flags1,flags2; struct timespec delta, delta_delta; static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&xtime_lock, flags); + write_seqlock_irqsave(&xtime_lock, flags1); + write_seqlock_irqsave(&timekeeper.lock, flags2); timekeeping_forward_now(); timekeeping_suspended = 1; @@ -730,7 +759,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&xtime_lock, flags); + write_sequnlock_irqrestore(&timekeeper.lock, flags2); + write_sequnlock_irqrestore(&xtime_lock, flags1); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -983,10 +1013,13 @@ static void update_wall_time(void) struct clocksource *clock; cycle_t offset; int shift = 0, maxshift; + unsigned long flags; + + write_seqlock_irqsave(&timekeeper.lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) - return; + goto out; clock = timekeeper.clock; @@ -1067,6 +1100,10 @@ static void update_wall_time(void) /* check to see if there is a new clocksource to use */ update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); + +out: + write_sequnlock_irqrestore(&timekeeper.lock, flags); + } /** @@ -1112,13 +1149,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); *ts = timekeeper.xtime; tomono = timekeeper.wall_to_monotonic; sleep = timekeeper.total_sleep_time; nsecs = timekeeping_get_ns(); - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec, ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs); @@ -1169,10 +1206,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); now = timekeeper.xtime; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); return now; } @@ -1184,11 +1221,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); now = timekeeper.xtime; mono = timekeeper.wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1220,11 +1257,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); *xtim = timekeeper.xtime; *wtom = timekeeper.wall_to_monotonic; *sleep = timekeeper.total_sleep_time; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); } /** @@ -1236,9 +1273,10 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&xtime_lock); + seq = read_seqbegin(&timekeeper.lock); wtom = timekeeper.wall_to_monotonic; - } while (read_seqretry(&xtime_lock, seq)); + } while (read_seqretry(&timekeeper.lock, seq)); + return timespec_to_ktime(wtom); } -- cgit v1.2.3 From 8357929e6ae3661d5a3a7378a717f29873ea18c6 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 13:06:21 -0800 Subject: ntp: Cleanup timex.h Move ntp_sycned to ntp.c and mark time_status as static. Also yank function declaration for non-existant function. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- include/linux/timex.h | 15 --------------- kernel/time/ntp.c | 13 ++++++++++++- 2 files changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index aa60fe7b6ed6..92e01fc31ca6 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -234,23 +234,9 @@ struct timex { extern unsigned long tick_usec; /* USER_HZ period (usec) */ extern unsigned long tick_nsec; /* ACTHZ period (nsec) */ -/* - * phase-lock loop variables - */ -extern int time_status; /* clock synchronization status bits */ - extern void ntp_init(void); extern void ntp_clear(void); -/** - * ntp_synced - Returns 1 if the NTP status is not UNSYNC - * - */ -static inline int ntp_synced(void) -{ - return !(time_status & STA_UNSYNC); -} - /* Required to safely shift negative values */ #define shift_right(x, s) ({ \ __typeof__(x) __x = (x); \ @@ -267,7 +253,6 @@ static inline int ntp_synced(void) extern u64 tick_length; extern void second_overflow(void); -extern void update_ntp_one_tick(void); extern int do_adjtimex(struct timex *); extern void hardpps(const struct timespec *, const struct timespec *); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f6117a4c7cb8..ae7e13607d91 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -49,7 +49,7 @@ static struct hrtimer leap_timer; static int time_state = TIME_OK; /* clock status bits: */ -int time_status = STA_UNSYNC; +static int time_status = STA_UNSYNC; /* TAI offset (secs): */ static long time_tai; @@ -233,6 +233,17 @@ static inline void pps_fill_timex(struct timex *txc) #endif /* CONFIG_NTP_PPS */ + +/** + * ntp_synced - Returns 1 if the NTP status is not UNSYNC + * + */ +static inline int ntp_synced(void) +{ + return !(time_status & STA_UNSYNC); +} + + /* * NTP methods: */ -- cgit v1.2.3 From ea7cf49a7633c2b70125f59b4e3553d9181cb15d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 13:18:07 -0800 Subject: ntp: Access tick_length variable via ntp_tick_length() Currently the NTP managed tick_length value is accessed globally, in preparations for locking cleanups, make sure it is accessed via a function and mark it as static. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- include/linux/timex.h | 2 +- kernel/time/ntp.c | 9 ++++++++- kernel/time/timekeeping.c | 6 +++--- 3 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index 92e01fc31ca6..b75e1864ed19 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -250,7 +250,7 @@ extern void ntp_clear(void); #define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ) /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ -extern u64 tick_length; +extern u64 ntp_tick_length(void); extern void second_overflow(void); extern int do_adjtimex(struct timex *); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ae7e13607d91..f131ba62da62 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -28,7 +28,7 @@ unsigned long tick_usec = TICK_USEC; /* ACTHZ period (nsecs): */ unsigned long tick_nsec; -u64 tick_length; +static u64 tick_length; static u64 tick_length_base; static struct hrtimer leap_timer; @@ -360,6 +360,13 @@ void ntp_clear(void) pps_clear(); } + +u64 ntp_tick_length(void) +{ + return tick_length; +} + + /* * Leap second processing. If in leap-insert state at the end of the * day, the system clock is set back one second; if in leap-delete diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f5d4d226defb..cdae24655c8d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -811,7 +811,7 @@ static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval, * Now calculate the error in (1 << look_ahead) ticks, but first * remove the single look ahead already included in the error. */ - tick_error = tick_length >> (timekeeper.ntp_error_shift + 1); + tick_error = ntp_tick_length() >> (timekeeper.ntp_error_shift + 1); tick_error -= timekeeper.xtime_interval >> 1; error = ((error - tick_error) >> look_ahead) + tick_error; @@ -994,7 +994,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.raw_time.tv_nsec = raw_nsecs; /* Accumulate error between NTP and clock interval */ - timekeeper.ntp_error += tick_length << shift; + timekeeper.ntp_error += ntp_tick_length() << shift; timekeeper.ntp_error -= (timekeeper.xtime_interval + timekeeper.xtime_remainder) << (timekeeper.ntp_error_shift + shift); @@ -1042,7 +1042,7 @@ static void update_wall_time(void) shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); shift = max(0, shift); /* Bound shift to one less then what overflows tick_length */ - maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { offset = logarithmic_accumulation(offset, shift); -- cgit v1.2.3 From bd3312681f69207a40431981c1bce1afdc9b7975 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 13:48:36 -0800 Subject: ntp: Add ntp_lock to replace xtime_locking Use a ntp_lock spin lock to replace xtime_lock locking in ntp.c CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/ntp.c | 63 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index f131ba62da62..17fb1b9807d0 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -22,6 +22,9 @@ * NTP timekeeping variables: */ +DEFINE_SPINLOCK(ntp_lock); + + /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; @@ -133,7 +136,7 @@ static inline void pps_reset_freq_interval(void) /** * pps_clear - Clears the PPS state variables * - * Must be called while holding a write on the xtime_lock + * Must be called while holding a write on the ntp_lock */ static inline void pps_clear(void) { @@ -149,7 +152,7 @@ static inline void pps_clear(void) * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. * - * Must be called while holding a write on the xtime_lock + * Must be called while holding a write on the ntp_lock */ static inline void pps_dec_valid(void) { @@ -341,11 +344,13 @@ static void ntp_update_offset(long offset) /** * ntp_clear - Clears the NTP state variables - * - * Must be called while holding a write on the xtime_lock */ void ntp_clear(void) { + unsigned long flags; + + spin_lock_irqsave(&ntp_lock, flags); + time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; @@ -358,12 +363,20 @@ void ntp_clear(void) /* Clear PPS state variables */ pps_clear(); + spin_unlock_irqrestore(&ntp_lock, flags); + } u64 ntp_tick_length(void) { - return tick_length; + unsigned long flags; + s64 ret; + + spin_lock_irqsave(&ntp_lock, flags); + ret = tick_length; + spin_unlock_irqrestore(&ntp_lock, flags); + return ret; } @@ -375,14 +388,15 @@ u64 ntp_tick_length(void) static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) { enum hrtimer_restart res = HRTIMER_NORESTART; + unsigned long flags; + int leap = 0; - write_seqlock(&xtime_lock); - + spin_lock_irqsave(&ntp_lock, flags); switch (time_state) { case TIME_OK: break; case TIME_INS: - timekeeping_leap_insert(-1); + leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); @@ -390,7 +404,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) res = HRTIMER_RESTART; break; case TIME_DEL: - timekeeping_leap_insert(1); + leap = 1; time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE @@ -405,8 +419,14 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) time_state = TIME_OK; break; } + spin_unlock_irqrestore(&ntp_lock, flags); - write_sequnlock(&xtime_lock); + /* + * We have to call this outside of the ntp_lock to keep + * the proper locking hierarchy + */ + if (leap) + timekeeping_leap_insert(leap); return res; } @@ -422,6 +442,9 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) void second_overflow(void) { s64 delta; + unsigned long flags; + + spin_lock_irqsave(&ntp_lock, flags); /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -441,23 +464,25 @@ void second_overflow(void) pps_dec_valid(); if (!time_adjust) - return; + goto out; if (time_adjust > MAX_TICKADJ) { time_adjust -= MAX_TICKADJ; tick_length += MAX_TICKADJ_SCALED; - return; + goto out; } if (time_adjust < -MAX_TICKADJ) { time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; - return; + goto out; } tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; +out: + spin_unlock_irqrestore(&ntp_lock, flags); } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -681,7 +706,7 @@ int do_adjtimex(struct timex *txc) getnstimeofday(&ts); - write_seqlock_irq(&xtime_lock); + spin_lock_irq(&ntp_lock); if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; @@ -723,7 +748,7 @@ int do_adjtimex(struct timex *txc) /* fill PPS status fields */ pps_fill_timex(txc); - write_sequnlock_irq(&xtime_lock); + spin_unlock_irq(&ntp_lock); txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; @@ -921,7 +946,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) pts_norm = pps_normalize_ts(*phase_ts); - write_seqlock_irqsave(&xtime_lock, flags); + spin_lock_irqsave(&ntp_lock, flags); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); @@ -934,7 +959,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; - write_sequnlock_irqrestore(&xtime_lock, flags); + spin_unlock_irqrestore(&ntp_lock, flags); return; } @@ -949,7 +974,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - write_sequnlock_irqrestore(&xtime_lock, flags); + spin_unlock_irqrestore(&ntp_lock, flags); pr_err("hardpps: PPSJITTER: bad pulse\n"); return; } @@ -966,7 +991,7 @@ void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) hardpps_update_phase(pts_norm.nsec); - write_sequnlock_irqrestore(&xtime_lock, flags); + spin_unlock_irqrestore(&ntp_lock, flags); } EXPORT_SYMBOL(hardpps); -- cgit v1.2.3 From 92c1d3ed4dc0b8cfb10e85ed0c9934db41efc027 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 14 Nov 2011 14:05:44 -0800 Subject: time: Remove most of xtime_lock usage in timekeeping.c Now that ntp.c's locking is reworked, we can remove most of the xtime_lock usage in timekeeping.c The remaining xtime_lock presence is really for jiffies access and the global load calculation. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cdae24655c8d..74bb5701e0b3 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -172,7 +172,6 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } -/* must hold xtime_lock */ void timekeeping_leap_insert(int leapsecond) { unsigned long flags; @@ -372,13 +371,12 @@ EXPORT_SYMBOL(do_gettimeofday); int do_settimeofday(const struct timespec *tv) { struct timespec ts_delta; - unsigned long flags1,flags2; + unsigned long flags; if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags1); - write_seqlock_irqsave(&timekeeper.lock, flags2); + write_seqlock_irqsave(&timekeeper.lock, flags); timekeeping_forward_now(); @@ -395,8 +393,7 @@ int do_settimeofday(const struct timespec *tv) update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); - write_sequnlock_irqrestore(&timekeeper.lock, flags2); - write_sequnlock_irqrestore(&xtime_lock, flags1); + write_sequnlock_irqrestore(&timekeeper.lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -415,13 +412,12 @@ EXPORT_SYMBOL(do_settimeofday); */ int timekeeping_inject_offset(struct timespec *ts) { - unsigned long flags1,flags2; + unsigned long flags; if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&xtime_lock, flags1); - write_seqlock_irqsave(&timekeeper.lock, flags2); + write_seqlock_irqsave(&timekeeper.lock, flags); timekeeping_forward_now(); @@ -435,8 +431,7 @@ int timekeeping_inject_offset(struct timespec *ts) update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); - write_sequnlock_irqrestore(&timekeeper.lock, flags2); - write_sequnlock_irqrestore(&xtime_lock, flags1); + write_sequnlock_irqrestore(&timekeeper.lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -598,9 +593,7 @@ void __init timekeeping_init(void) seqlock_init(&timekeeper.lock); - write_seqlock_irqsave(&xtime_lock, flags); ntp_init(); - write_sequnlock_irqrestore(&xtime_lock, flags); write_seqlock_irqsave(&timekeeper.lock, flags); clock = clocksource_default_clock(); @@ -661,7 +654,7 @@ static void __timekeeping_inject_sleeptime(struct timespec *delta) */ void timekeeping_inject_sleeptime(struct timespec *delta) { - unsigned long flags1,flags2; + unsigned long flags; struct timespec ts; /* Make sure we don't set the clock twice */ @@ -669,8 +662,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (!(ts.tv_sec == 0 && ts.tv_nsec == 0)) return; - write_seqlock_irqsave(&xtime_lock, flags1); - write_seqlock_irqsave(&timekeeper.lock, flags2); + write_seqlock_irqsave(&timekeeper.lock, flags); timekeeping_forward_now(); @@ -681,8 +673,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, timekeeper.clock, timekeeper.mult); - write_sequnlock_irqrestore(&timekeeper.lock, flags2); - write_sequnlock_irqrestore(&xtime_lock, flags1); + write_sequnlock_irqrestore(&timekeeper.lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -698,15 +689,14 @@ void timekeeping_inject_sleeptime(struct timespec *delta) */ static void timekeeping_resume(void) { - unsigned long flags1,flags2; + unsigned long flags; struct timespec ts; read_persistent_clock(&ts); clocksource_resume(); - write_seqlock_irqsave(&xtime_lock, flags1); - write_seqlock_irqsave(&timekeeper.lock, flags2); + write_seqlock_irqsave(&timekeeper.lock, flags); if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) { ts = timespec_sub(ts, timekeeping_suspend_time); @@ -716,8 +706,7 @@ static void timekeeping_resume(void) timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); timekeeper.ntp_error = 0; timekeeping_suspended = 0; - write_sequnlock_irqrestore(&timekeeper.lock, flags2); - write_sequnlock_irqrestore(&xtime_lock, flags1); + write_sequnlock_irqrestore(&timekeeper.lock, flags); touch_softlockup_watchdog(); @@ -729,14 +718,13 @@ static void timekeeping_resume(void) static int timekeeping_suspend(void) { - unsigned long flags1,flags2; + unsigned long flags; struct timespec delta, delta_delta; static struct timespec old_delta; read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&xtime_lock, flags1); - write_seqlock_irqsave(&timekeeper.lock, flags2); + write_seqlock_irqsave(&timekeeper.lock, flags); timekeeping_forward_now(); timekeeping_suspended = 1; @@ -759,8 +747,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&timekeeper.lock, flags2); - write_sequnlock_irqrestore(&xtime_lock, flags1); + write_sequnlock_irqrestore(&timekeeper.lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1006,7 +993,6 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) /** * update_wall_time - Uses the current clocksource to increment the wall time * - * Called from the timer interrupt, must hold a write on xtime_lock. */ static void update_wall_time(void) { -- cgit v1.2.3 From 058892e632aa53be8255c2f0a42f9ace7bed66bb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Nov 2011 23:19:48 +0000 Subject: time: Reorder so the hot data is together Keep all the interesting data in a single cache line. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 74bb5701e0b3..06f40ae13b7b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -25,6 +25,8 @@ struct timekeeper { /* Current clocksource used for timekeeping. */ struct clocksource *clock; + /* NTP adjusted clock multiplier */ + u32 mult; /* The shift value of the current clocksource. */ int shift; @@ -45,8 +47,6 @@ struct timekeeper { /* Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. */ int ntp_error_shift; - /* NTP adjusted clock multiplier */ - u32 mult; /* The current time */ struct timespec xtime; -- cgit v1.2.3 From cc06268c6a87db156af2daed6e96a936b955cc82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 13 Nov 2011 23:19:49 +0000 Subject: time: Move common updates to a function CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 06f40ae13b7b..403c2a092830 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -172,17 +172,26 @@ static inline s64 timekeeping_get_ns_raw(void) return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); } +/* must hold write on timekeeper.lock */ +static void timekeeping_update(bool clearntp) +{ + if (clearntp) { + timekeeper.ntp_error = 0; + ntp_clear(); + } + update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, + timekeeper.clock, timekeeper.mult); +} + + void timekeeping_leap_insert(int leapsecond) { unsigned long flags; write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeper.xtime.tv_sec += leapsecond; timekeeper.wall_to_monotonic.tv_sec -= leapsecond; - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); - + timekeeping_update(false); write_sequnlock_irqrestore(&timekeeper.lock, flags); } @@ -386,12 +395,7 @@ int do_settimeofday(const struct timespec *tv) timespec_sub(timekeeper.wall_to_monotonic, ts_delta); timekeeper.xtime = *tv; - - timekeeper.ntp_error = 0; - ntp_clear(); - - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + timekeeping_update(true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -425,11 +429,7 @@ int timekeeping_inject_offset(struct timespec *ts) timekeeper.wall_to_monotonic = timespec_sub(timekeeper.wall_to_monotonic, *ts); - timekeeper.ntp_error = 0; - ntp_clear(); - - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + timekeeping_update(true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -668,10 +668,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) __timekeeping_inject_sleeptime(delta); - timekeeper.ntp_error = 0; - ntp_clear(); - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + timekeeping_update(true); write_sequnlock_irqrestore(&timekeeper.lock, flags); @@ -1083,9 +1080,7 @@ static void update_wall_time(void) second_overflow(); } - /* check to see if there is a new clocksource to use */ - update_vsyscall(&timekeeper.xtime, &timekeeper.wall_to_monotonic, - timekeeper.clock, timekeeper.mult); + timekeeping_update(false); out: write_sequnlock_irqrestore(&timekeeper.lock, flags); -- cgit v1.2.3 From 39be350127ec60a078edffe5b4915dafba4ba514 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Jan 2012 12:44:34 +0100 Subject: sched, block: Unify cache detection The block layer has some code trying to determine if two CPUs share a cache, the scheduler has a similar function. Expose the function used by the scheduler and make the block layer use it, thereby removing the block layers usage of CONFIG_SCHED* and topology bits. Signed-off-by: Peter Zijlstra Acked-by: Jens Axboe Link: http://lkml.kernel.org/r/1327579450.2446.95.camel@twins --- block/blk-softirq.c | 16 ++++++++-------- block/blk.h | 16 ---------------- include/linux/sched.h | 8 ++++++++ kernel/sched/core.c | 6 +++--- 4 files changed, 19 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 1366a89d8e66..467c8de88642 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "blk.h" @@ -103,9 +104,10 @@ static struct notifier_block __cpuinitdata blk_cpu_notifier = { void __blk_complete_request(struct request *req) { - int ccpu, cpu, group_cpu = NR_CPUS; + int ccpu, cpu; struct request_queue *q = req->q; unsigned long flags; + bool shared = false; BUG_ON(!q->softirq_done_fn); @@ -117,22 +119,20 @@ void __blk_complete_request(struct request *req) */ if (req->cpu != -1) { ccpu = req->cpu; - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { - ccpu = blk_cpu_to_group(ccpu); - group_cpu = blk_cpu_to_group(cpu); - } + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) + shared = cpus_share_cache(cpu, ccpu); } else ccpu = cpu; /* - * If current CPU and requested CPU are in the same group, running - * softirq in current CPU. One might concern this is just like + * If current CPU and requested CPU share a cache, run the softirq on + * the current CPU. One might concern this is just like * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is * running in interrupt handler, and currently I/O controller doesn't * support multiple interrupts, so current CPU is unique actually. This * avoids IPI sending from current CPU to the first CPU of a group. */ - if (ccpu == cpu || ccpu == group_cpu) { + if (ccpu == cpu || shared) { struct list_head *list; do_local: list = &__get_cpu_var(blk_cpu_done); diff --git a/block/blk.h b/block/blk.h index 7efd772336de..df5b59acbdff 100644 --- a/block/blk.h +++ b/block/blk.h @@ -164,22 +164,6 @@ static inline int queue_congestion_off_threshold(struct request_queue *q) return q->nr_congestion_off; } -static inline int blk_cpu_to_group(int cpu) -{ - int group = NR_CPUS; -#ifdef CONFIG_SCHED_MC - const struct cpumask *mask = cpu_coregroup_mask(cpu); - group = cpumask_first(mask); -#elif defined(CONFIG_SCHED_SMT) - group = cpumask_first(topology_thread_cpumask(cpu)); -#else - return cpu; -#endif - if (likely(group < NR_CPUS)) - return group; - return cpu; -} - /* * Contribute to IO statistics IFF: * diff --git a/include/linux/sched.h b/include/linux/sched.h index 513f52459872..0e1959568836 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1052,6 +1052,8 @@ static inline int test_sd_parent(struct sched_domain *sd, int flag) unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); +bool cpus_share_cache(int this_cpu, int that_cpu); + #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -1061,6 +1063,12 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], struct sched_domain_attr *dattr_new) { } + +static inline bool cpus_share_cache(int this_cpu, int that_cpu) +{ + return true; +} + #endif /* !CONFIG_SMP */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..d7c43227311d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1507,7 +1507,7 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags) } #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ -static inline int ttwu_share_cache(int this_cpu, int that_cpu) +bool cpus_share_cache(int this_cpu, int that_cpu) { return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } @@ -1518,7 +1518,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) struct rq *rq = cpu_rq(cpu); #if defined(CONFIG_SMP) - if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) { + if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { sched_clock_cpu(cpu); /* sync clocks x-cpu */ ttwu_queue_remote(p, cpu); return; @@ -5754,7 +5754,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * * Also keep a unique ID per domain (we use the first cpu number in * the cpumask of the domain), this allows us to quickly tell if - * two cpus are in the same cache domain, see ttwu_share_cache(). + * two cpus are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_id); -- cgit v1.2.3 From 4ec4412e1e91f44a3dcb97b6c9172a13fc78bac9 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 12 Dec 2011 20:21:08 +0100 Subject: sched: Ensure cpu_power periodic update With a lot of small tasks, the softirq sched is nearly never called when no_hz is enabled. In this case load_balance() is mainly called with the newly_idle mode which doesn't update the cpu_power. Add a next_update field which ensure a maximum update period when there is short activity. Having stale cpu_power information can skew the load-balancing decisions, this is cured by the guaranteed update. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1323717668-2143-1-git-send-email-vincent.guittot@linaro.org --- include/linux/sched.h | 1 + kernel/sched/fair.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e1959568836..92313a3f6f77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -905,6 +905,7 @@ struct sched_group_power { * single CPU. */ unsigned int power, power_orig; + unsigned long next_update; /* * Number of busy cpus in this group. */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..8e77a6bd597b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -215,6 +215,8 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, const struct sched_class fair_sched_class; +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /************************************************************** * CFS operations on generic schedulable entities: */ @@ -3776,6 +3778,11 @@ void update_group_power(struct sched_domain *sd, int cpu) struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power; + unsigned long interval; + + interval = msecs_to_jiffies(sd->balance_interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + sdg->sgp->next_update = jiffies + interval; if (!child) { update_cpu_power(sd, cpu); @@ -3883,12 +3890,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. */ - if (idle != CPU_NEWLY_IDLE && local_group) { - if (balance_cpu != this_cpu) { - *balance = 0; - return; - } - update_group_power(sd, this_cpu); + if (local_group) { + if (idle != CPU_NEWLY_IDLE) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); + } else if (time_after_eq(jiffies, group->sgp->next_update)) + update_group_power(sd, this_cpu); } /* Adjust by relative CPU power of the group */ @@ -4945,8 +4955,6 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, static DEFINE_SPINLOCK(balancing); -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /* * Scale the max load_balance interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. -- cgit v1.2.3 From 30fd049afcfed50e022704036e8629d6bdfe84e6 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Tue, 24 Jan 2012 22:33:56 +0600 Subject: sched: Remove sched_switch Currently we don't utilize the sched_switch field anymore. But, simply removing sched_switch field from the middle of the sched_stat output will break tools. So, to stay compatible we hardcode it to zero and remove the field from the scheduler data structures. Update the schedstat documentation accordingly. Signed-off-by: Rakib Mullick Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1327422836.27181.5.camel@localhost.localdomain Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-stats.txt | 3 ++- kernel/sched/debug.c | 1 - kernel/sched/sched.h | 1 - kernel/sched/stats.c | 4 ++-- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/scheduler/sched-stats.txt b/Documentation/scheduler/sched-stats.txt index 1cd5d51bc761..8259b34a66ae 100644 --- a/Documentation/scheduler/sched-stats.txt +++ b/Documentation/scheduler/sched-stats.txt @@ -38,7 +38,8 @@ First field is a sched_yield() statistic: 1) # of times sched_yield() was called Next three are schedule() statistics: - 2) # of times we switched to the expired queue and reused it + 2) This field is a legacy array expiration count field used in the O(1) + scheduler. We kept it for ABI compatibility, but it is always set to zero. 3) # of times schedule() was called 4) # of times schedule() left the processor idle diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2a075e10004b..09acaa15161d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -288,7 +288,6 @@ static void print_cpu(struct seq_file *m, int cpu) P(yld_count); - P(sched_switch); P(sched_count); P(sched_goidle); #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..8a2c768d2f98 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -462,7 +462,6 @@ struct rq { unsigned int yld_count; /* schedule() stats */ - unsigned int sched_switch; unsigned int sched_count; unsigned int sched_goidle; diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 2a581ba8e190..903ffa9e8872 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -32,9 +32,9 @@ static int show_schedstat(struct seq_file *seq, void *v) /* runqueue-specific stats */ seq_printf(seq, - "cpu%d %u %u %u %u %u %u %llu %llu %lu", + "cpu%d %u 0 %u %u %u %u %llu %llu %lu", cpu, rq->yld_count, - rq->sched_switch, rq->sched_count, rq->sched_goidle, + rq->sched_count, rq->sched_goidle, rq->ttwu_count, rq->ttwu_local, rq->rq_cpu_time, rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount); -- cgit v1.2.3 From cf579dfb82550e34de7ccf3ef090d8b834ccd3a9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 29 Jan 2012 20:38:29 +0100 Subject: PM / Sleep: Introduce "late suspend" and "early resume" of devices The current device suspend/resume phases during system-wide power transitions appear to be insufficient for some platforms that want to use the same callback routines for saving device states and related operations during runtime suspend/resume as well as during system suspend/resume. In principle, they could point their .suspend_noirq() and .resume_noirq() to the same callback routines as their .runtime_suspend() and .runtime_resume(), respectively, but at least some of them require device interrupts to be enabled while the code in those routines is running. It also makes sense to have device suspend-resume callbacks that will be executed with runtime PM disabled and with device interrupts enabled in case someone needs to run some special code in that context during system-wide power transitions. Apart from this, .suspend_noirq() and .resume_noirq() were introduced as a workaround for drivers using shared interrupts and failing to prevent their interrupt handlers from accessing suspended hardware. It appears to be better not to use them for other porposes, or we may have to deal with some serious confusion (which seems to be happening already). For the above reasons, introduce new device suspend/resume phases, "late suspend" and "early resume" (and analogously for hibernation) whose callback will be executed with runtime PM disabled and with device interrupts enabled and whose callback pointers generally may point to runtime suspend/resume routines. Signed-off-by: Rafael J. Wysocki Reviewed-by: Mark Brown Reviewed-by: Kevin Hilman --- Documentation/power/devices.txt | 93 +++++++++------ arch/x86/kernel/apm_32.c | 11 +- drivers/base/power/main.c | 247 ++++++++++++++++++++++++++++++++++++---- drivers/xen/manage.c | 6 +- include/linux/pm.h | 43 +++++-- include/linux/suspend.h | 4 + kernel/kexec.c | 8 +- kernel/power/hibernate.c | 24 ++-- kernel/power/main.c | 8 +- kernel/power/suspend.c | 4 +- 10 files changed, 357 insertions(+), 91 deletions(-) (limited to 'kernel') diff --git a/Documentation/power/devices.txt b/Documentation/power/devices.txt index 20af7def23c8..872815cd41d3 100644 --- a/Documentation/power/devices.txt +++ b/Documentation/power/devices.txt @@ -96,6 +96,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -305,7 +311,7 @@ Entering System Suspend ----------------------- When the system goes into the standby or memory sleep state, the phases are: - prepare, suspend, suspend_noirq. + prepare, suspend, suspend_late, suspend_noirq. 1. The prepare phase is meant to prevent races by preventing new devices from being registered; the PM core would never know that all the @@ -324,7 +330,12 @@ When the system goes into the standby or memory sleep state, the phases are: appropriate low-power state, depending on the bus type the device is on, and they may enable wakeup events. - 3. The suspend_noirq phase occurs after IRQ handlers have been disabled, + 3 For a number of devices it is convenient to split suspend into the + "quiesce device" and "save device state" phases, in which cases + suspend_late is meant to do the latter. It is always executed after + runtime power management has been disabled for all devices. + + 4. The suspend_noirq phase occurs after IRQ handlers have been disabled, which means that the driver's interrupt handler will not be called while the callback method is running. The methods should save the values of the device's registers that weren't saved previously and finally put the @@ -359,7 +370,7 @@ Leaving System Suspend ---------------------- When resuming from standby or memory sleep, the phases are: - resume_noirq, resume, complete. + resume_noirq, resume_early, resume, complete. 1. The resume_noirq callback methods should perform any actions needed before the driver's interrupt handlers are invoked. This generally @@ -375,14 +386,18 @@ When resuming from standby or memory sleep, the phases are: device driver's ->pm.resume_noirq() method to perform device-specific actions. - 2. The resume methods should bring the the device back to its operating + 2. The resume_early methods should prepare devices for the execution of + the resume methods. This generally involves undoing the actions of the + preceding suspend_late phase. + + 3 The resume methods should bring the the device back to its operating state, so that it can perform normal I/O. This generally involves undoing the actions of the suspend phase. - 3. The complete phase uses only a bus callback. The method should undo the - actions of the prepare phase. Note, however, that new children may be - registered below the device as soon as the resume callbacks occur; it's - not necessary to wait until the complete phase. + 4. The complete phase should undo the actions of the prepare phase. Note, + however, that new children may be registered below the device as soon as + the resume callbacks occur; it's not necessary to wait until the + complete phase. At the end of these phases, drivers should be as functional as they were before suspending: I/O can be performed using DMA and IRQs, and the relevant clocks are @@ -429,8 +444,8 @@ an image of the system memory while everything is stable, reactivate all devices (thaw), write the image to permanent storage, and finally shut down the system (poweroff). The phases used to accomplish this are: - prepare, freeze, freeze_noirq, thaw_noirq, thaw, complete, - prepare, poweroff, poweroff_noirq + prepare, freeze, freeze_late, freeze_noirq, thaw_noirq, thaw_early, + thaw, complete, prepare, poweroff, poweroff_late, poweroff_noirq 1. The prepare phase is discussed in the "Entering System Suspend" section above. @@ -441,7 +456,11 @@ system (poweroff). The phases used to accomplish this are: save time it's best not to do so. Also, the device should not be prepared to generate wakeup events. - 3. The freeze_noirq phase is analogous to the suspend_noirq phase discussed + 3. The freeze_late phase is analogous to the suspend_late phase described + above, except that the device should not be put in a low-power state and + should not be allowed to generate wakeup events by it. + + 4. The freeze_noirq phase is analogous to the suspend_noirq phase discussed above, except again that the device should not be put in a low-power state and should not be allowed to generate wakeup events. @@ -449,15 +468,19 @@ At this point the system image is created. All devices should be inactive and the contents of memory should remain undisturbed while this happens, so that the image forms an atomic snapshot of the system state. - 4. The thaw_noirq phase is analogous to the resume_noirq phase discussed + 5. The thaw_noirq phase is analogous to the resume_noirq phase discussed above. The main difference is that its methods can assume the device is in the same state as at the end of the freeze_noirq phase. - 5. The thaw phase is analogous to the resume phase discussed above. Its + 6. The thaw_early phase is analogous to the resume_early phase described + above. Its methods should undo the actions of the preceding + freeze_late, if necessary. + + 7. The thaw phase is analogous to the resume phase discussed above. Its methods should bring the device back to an operating state, so that it can be used for saving the image if necessary. - 6. The complete phase is discussed in the "Leaving System Suspend" section + 8. The complete phase is discussed in the "Leaving System Suspend" section above. At this point the system image is saved, and the devices then need to be @@ -465,16 +488,19 @@ prepared for the upcoming system shutdown. This is much like suspending them before putting the system into the standby or memory sleep state, and the phases are similar. - 7. The prepare phase is discussed above. + 9. The prepare phase is discussed above. + + 10. The poweroff phase is analogous to the suspend phase. - 8. The poweroff phase is analogous to the suspend phase. + 11. The poweroff_late phase is analogous to the suspend_late phase. - 9. The poweroff_noirq phase is analogous to the suspend_noirq phase. + 12. The poweroff_noirq phase is analogous to the suspend_noirq phase. -The poweroff and poweroff_noirq callbacks should do essentially the same things -as the suspend and suspend_noirq callbacks. The only notable difference is that -they need not store the device register values, because the registers should -already have been stored during the freeze or freeze_noirq phases. +The poweroff, poweroff_late and poweroff_noirq callbacks should do essentially +the same things as the suspend, suspend_late and suspend_noirq callbacks, +respectively. The only notable difference is that they need not store the +device register values, because the registers should already have been stored +during the freeze, freeze_late or freeze_noirq phases. Leaving Hibernation @@ -518,22 +544,25 @@ To achieve this, the image kernel must restore the devices' pre-hibernation functionality. The operation is much like waking up from the memory sleep state, although it involves different phases: - restore_noirq, restore, complete + restore_noirq, restore_early, restore, complete 1. The restore_noirq phase is analogous to the resume_noirq phase. - 2. The restore phase is analogous to the resume phase. + 2. The restore_early phase is analogous to the resume_early phase. + + 3. The restore phase is analogous to the resume phase. - 3. The complete phase is discussed above. + 4. The complete phase is discussed above. -The main difference from resume[_noirq] is that restore[_noirq] must assume the -device has been accessed and reconfigured by the boot loader or the boot kernel. -Consequently the state of the device may be different from the state remembered -from the freeze and freeze_noirq phases. The device may even need to be reset -and completely re-initialized. In many cases this difference doesn't matter, so -the resume[_noirq] and restore[_norq] method pointers can be set to the same -routines. Nevertheless, different callback pointers are used in case there is a -situation where it actually matters. +The main difference from resume[_early|_noirq] is that restore[_early|_noirq] +must assume the device has been accessed and reconfigured by the boot loader or +the boot kernel. Consequently the state of the device may be different from the +state remembered from the freeze, freeze_late and freeze_noirq phases. The +device may even need to be reset and completely re-initialized. In many cases +this difference doesn't matter, so the resume[_early|_noirq] and +restore[_early|_norq] method pointers can be set to the same routines. +Nevertheless, different callback pointers are used in case there is a situation +where it actually does matter. Device Power Management Domains diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f76623cbe263..5d56931a15b3 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -1234,8 +1234,7 @@ static int suspend(int vetoable) struct apm_user *as; dpm_suspend_start(PMSG_SUSPEND); - - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1259,9 +1258,9 @@ static int suspend(int vetoable) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); - + dpm_resume_start(PMSG_RESUME); dpm_resume_end(PMSG_RESUME); + queue_event(APM_NORMAL_RESUME, NULL); spin_lock(&user_list_lock); for (as = user_list; as != NULL; as = as->next) { @@ -1277,7 +1276,7 @@ static void standby(void) { int err; - dpm_suspend_noirq(PMSG_SUSPEND); + dpm_suspend_end(PMSG_SUSPEND); local_irq_disable(); syscore_suspend(); @@ -1291,7 +1290,7 @@ static void standby(void) syscore_resume(); local_irq_enable(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); } static apm_event_t get_event(void) diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index e2cc3d2e0ecc..b462c0e341cb 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -47,6 +47,7 @@ typedef int (*pm_callback_t)(struct device *); LIST_HEAD(dpm_list); LIST_HEAD(dpm_prepared_list); LIST_HEAD(dpm_suspended_list); +LIST_HEAD(dpm_late_early_list); LIST_HEAD(dpm_noirq_list); struct suspend_stats suspend_stats; @@ -245,6 +246,40 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) return NULL; } +/** + * pm_late_early_op - Return the PM operation appropriate for given PM event. + * @ops: PM operations to choose from. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, + pm_message_t state) +{ + switch (state.event) { +#ifdef CONFIG_SUSPEND + case PM_EVENT_SUSPEND: + return ops->suspend_late; + case PM_EVENT_RESUME: + return ops->resume_early; +#endif /* CONFIG_SUSPEND */ +#ifdef CONFIG_HIBERNATE_CALLBACKS + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_late; + case PM_EVENT_HIBERNATE: + return ops->poweroff_late; + case PM_EVENT_THAW: + case PM_EVENT_RECOVER: + return ops->thaw_early; + case PM_EVENT_RESTORE: + return ops->restore_early; +#endif /* CONFIG_HIBERNATE_CALLBACKS */ + } + + return NULL; +} + /** * pm_noirq_op - Return the PM operation appropriate for given PM event. * @ops: PM operations to choose from. @@ -374,21 +409,21 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) TRACE_RESUME(0); if (dev->pm_domain) { - info = "EARLY power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "EARLY type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "EARLY class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "EARLY bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "EARLY driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -399,13 +434,13 @@ static int device_resume_noirq(struct device *dev, pm_message_t state) } /** - * dpm_resume_noirq - Execute "early resume" callbacks for non-sysdev devices. + * dpm_resume_noirq - Execute "noirq resume" callbacks for all devices. * @state: PM transition of the system being carried out. * - * Call the "noirq" resume handlers for all devices marked as DPM_OFF_IRQ and + * Call the "noirq" resume handlers for all devices in dpm_noirq_list and * enable device drivers to receive interrupts. */ -void dpm_resume_noirq(pm_message_t state) +static void dpm_resume_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); @@ -415,7 +450,7 @@ void dpm_resume_noirq(pm_message_t state) int error; get_device(dev); - list_move_tail(&dev->power.entry, &dpm_suspended_list); + list_move_tail(&dev->power.entry, &dpm_late_early_list); mutex_unlock(&dpm_list_mtx); error = device_resume_noirq(dev, state); @@ -423,6 +458,80 @@ void dpm_resume_noirq(pm_message_t state) suspend_stats.failed_resume_noirq++; dpm_save_failed_step(SUSPEND_RESUME_NOIRQ); dpm_save_failed_dev(dev_name(dev)); + pm_dev_err(dev, state, " noirq", error); + } + + mutex_lock(&dpm_list_mtx); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + dpm_show_time(starttime, state, "noirq"); + resume_device_irqs(); +} + +/** + * device_resume_early - Execute an "early resume" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static int device_resume_early(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + int error = 0; + + TRACE_DEVICE(dev); + TRACE_RESUME(0); + + if (dev->pm_domain) { + info = "early power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "early type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "early class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "early bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "early driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + error = dpm_run_callback(callback, dev, state, info); + + TRACE_RESUME(error); + return error; +} + +/** + * dpm_resume_early - Execute "early resume" callbacks for all devices. + * @state: PM transition of the system being carried out. + */ +static void dpm_resume_early(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.next); + int error; + + get_device(dev); + list_move_tail(&dev->power.entry, &dpm_suspended_list); + mutex_unlock(&dpm_list_mtx); + + error = device_resume_early(dev, state); + if (error) { + suspend_stats.failed_resume_early++; + dpm_save_failed_step(SUSPEND_RESUME_EARLY); + dpm_save_failed_dev(dev_name(dev)); pm_dev_err(dev, state, " early", error); } @@ -431,9 +540,18 @@ void dpm_resume_noirq(pm_message_t state) } mutex_unlock(&dpm_list_mtx); dpm_show_time(starttime, state, "early"); - resume_device_irqs(); } -EXPORT_SYMBOL_GPL(dpm_resume_noirq); + +/** + * dpm_resume_start - Execute "noirq" and "early" device callbacks. + * @state: PM transition of the system being carried out. + */ +void dpm_resume_start(pm_message_t state) +{ + dpm_resume_noirq(state); + dpm_resume_early(state); +} +EXPORT_SYMBOL_GPL(dpm_resume_start); /** * device_resume - Execute "resume" callbacks for given device. @@ -716,21 +834,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) char *info = NULL; if (dev->pm_domain) { - info = "LATE power domain "; + info = "noirq power domain "; callback = pm_noirq_op(&dev->pm_domain->ops, state); } else if (dev->type && dev->type->pm) { - info = "LATE type "; + info = "noirq type "; callback = pm_noirq_op(dev->type->pm, state); } else if (dev->class && dev->class->pm) { - info = "LATE class "; + info = "noirq class "; callback = pm_noirq_op(dev->class->pm, state); } else if (dev->bus && dev->bus->pm) { - info = "LATE bus "; + info = "noirq bus "; callback = pm_noirq_op(dev->bus->pm, state); } if (!callback && dev->driver && dev->driver->pm) { - info = "LATE driver "; + info = "noirq driver "; callback = pm_noirq_op(dev->driver->pm, state); } @@ -738,21 +856,21 @@ static int device_suspend_noirq(struct device *dev, pm_message_t state) } /** - * dpm_suspend_noirq - Execute "late suspend" callbacks for non-sysdev devices. + * dpm_suspend_noirq - Execute "noirq suspend" callbacks for all devices. * @state: PM transition of the system being carried out. * * Prevent device drivers from receiving interrupts and call the "noirq" suspend * handlers for all non-sysdev devices. */ -int dpm_suspend_noirq(pm_message_t state) +static int dpm_suspend_noirq(pm_message_t state) { ktime_t starttime = ktime_get(); int error = 0; suspend_device_irqs(); mutex_lock(&dpm_list_mtx); - while (!list_empty(&dpm_suspended_list)) { - struct device *dev = to_device(dpm_suspended_list.prev); + while (!list_empty(&dpm_late_early_list)) { + struct device *dev = to_device(dpm_late_early_list.prev); get_device(dev); mutex_unlock(&dpm_list_mtx); @@ -761,7 +879,7 @@ int dpm_suspend_noirq(pm_message_t state) mutex_lock(&dpm_list_mtx); if (error) { - pm_dev_err(dev, state, " late", error); + pm_dev_err(dev, state, " noirq", error); suspend_stats.failed_suspend_noirq++; dpm_save_failed_step(SUSPEND_SUSPEND_NOIRQ); dpm_save_failed_dev(dev_name(dev)); @@ -775,11 +893,96 @@ int dpm_suspend_noirq(pm_message_t state) mutex_unlock(&dpm_list_mtx); if (error) dpm_resume_noirq(resume_event(state)); + else + dpm_show_time(starttime, state, "noirq"); + return error; +} + +/** + * device_suspend_late - Execute a "late suspend" callback for given device. + * @dev: Device to handle. + * @state: PM transition of the system being carried out. + * + * Runtime PM is disabled for @dev while this function is being executed. + */ +static int device_suspend_late(struct device *dev, pm_message_t state) +{ + pm_callback_t callback = NULL; + char *info = NULL; + + if (dev->pm_domain) { + info = "late power domain "; + callback = pm_late_early_op(&dev->pm_domain->ops, state); + } else if (dev->type && dev->type->pm) { + info = "late type "; + callback = pm_late_early_op(dev->type->pm, state); + } else if (dev->class && dev->class->pm) { + info = "late class "; + callback = pm_late_early_op(dev->class->pm, state); + } else if (dev->bus && dev->bus->pm) { + info = "late bus "; + callback = pm_late_early_op(dev->bus->pm, state); + } + + if (!callback && dev->driver && dev->driver->pm) { + info = "late driver "; + callback = pm_late_early_op(dev->driver->pm, state); + } + + return dpm_run_callback(callback, dev, state, info); +} + +/** + * dpm_suspend_late - Execute "late suspend" callbacks for all devices. + * @state: PM transition of the system being carried out. + */ +static int dpm_suspend_late(pm_message_t state) +{ + ktime_t starttime = ktime_get(); + int error = 0; + + mutex_lock(&dpm_list_mtx); + while (!list_empty(&dpm_suspended_list)) { + struct device *dev = to_device(dpm_suspended_list.prev); + + get_device(dev); + mutex_unlock(&dpm_list_mtx); + + error = device_suspend_late(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) { + pm_dev_err(dev, state, " late", error); + suspend_stats.failed_suspend_late++; + dpm_save_failed_step(SUSPEND_SUSPEND_LATE); + dpm_save_failed_dev(dev_name(dev)); + put_device(dev); + break; + } + if (!list_empty(&dev->power.entry)) + list_move(&dev->power.entry, &dpm_late_early_list); + put_device(dev); + } + mutex_unlock(&dpm_list_mtx); + if (error) + dpm_resume_early(resume_event(state)); else dpm_show_time(starttime, state, "late"); + return error; } -EXPORT_SYMBOL_GPL(dpm_suspend_noirq); + +/** + * dpm_suspend_end - Execute "late" and "noirq" device suspend callbacks. + * @state: PM transition of the system being carried out. + */ +int dpm_suspend_end(pm_message_t state) +{ + int error = dpm_suspend_late(state); + + return error ? : dpm_suspend_noirq(state); +} +EXPORT_SYMBOL_GPL(dpm_suspend_end); /** * legacy_suspend - Execute a legacy (bus or class) suspend callback for device. diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index ce4fa0831860..9e14ae6cd49c 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -129,9 +129,9 @@ static void do_suspend(void) printk(KERN_DEBUG "suspending xenstore...\n"); xs_suspend(); - err = dpm_suspend_noirq(PMSG_FREEZE); + err = dpm_suspend_end(PMSG_FREEZE); if (err) { - printk(KERN_ERR "dpm_suspend_noirq failed: %d\n", err); + printk(KERN_ERR "dpm_suspend_end failed: %d\n", err); goto out_resume; } @@ -149,7 +149,7 @@ static void do_suspend(void) err = stop_machine(xen_suspend, &si, cpumask_of(0)); - dpm_resume_noirq(si.cancelled ? PMSG_THAW : PMSG_RESTORE); + dpm_resume_start(si.cancelled ? PMSG_THAW : PMSG_RESTORE); if (err) { printk(KERN_ERR "failed to start xen_suspend: %d\n", err); diff --git a/include/linux/pm.h b/include/linux/pm.h index e4982ac3fbbc..c68e1f22ac95 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -110,6 +110,10 @@ typedef struct pm_message { * Subsystem-level @suspend() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @suspend_late: Continue operations started by @suspend(). For a number of + * devices @suspend_late() may point to the same callback routine as the + * runtime suspend callback. + * * @resume: Executed after waking the system up from a sleep state in which the * contents of main memory were preserved. The exact action to perform * depends on the device's subsystem, but generally the driver is expected @@ -122,6 +126,10 @@ typedef struct pm_message { * Subsystem-level @resume() is executed for all devices after invoking * subsystem-level @resume_noirq() for all of them. * + * @resume_early: Prepare to execute @resume(). For a number of devices + * @resume_early() may point to the same callback routine as the runtime + * resume callback. + * * @freeze: Hibernation-specific, executed before creating a hibernation image. * Analogous to @suspend(), but it should not enable the device to signal * wakeup events or change its power state. The majority of subsystems @@ -131,6 +139,10 @@ typedef struct pm_message { * Subsystem-level @freeze() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @freeze_late: Continue operations started by @freeze(). Analogous to + * @suspend_late(), but it should not enable the device to signal wakeup + * events or change its power state. + * * @thaw: Hibernation-specific, executed after creating a hibernation image OR * if the creation of an image has failed. Also executed after a failing * attempt to restore the contents of main memory from such an image. @@ -140,15 +152,23 @@ typedef struct pm_message { * subsystem-level @thaw_noirq() for all of them. It also may be executed * directly after @freeze() in case of a transition error. * + * @thaw_early: Prepare to execute @thaw(). Undo the changes made by the + * preceding @freeze_late(). + * * @poweroff: Hibernation-specific, executed after saving a hibernation image. * Analogous to @suspend(), but it need not save the device's settings in * memory. * Subsystem-level @poweroff() is executed for all devices after invoking * subsystem-level @prepare() for all of them. * + * @poweroff_late: Continue operations started by @poweroff(). Analogous to + * @suspend_late(), but it need not save the device's settings in memory. + * * @restore: Hibernation-specific, executed after restoring the contents of main * memory from a hibernation image, analogous to @resume(). * + * @restore_early: Prepare to execute @restore(), analogous to @resume_early(). + * * @suspend_noirq: Complete the actions started by @suspend(). Carry out any * additional operations required for suspending the device that might be * racing with its driver's interrupt handler, which is guaranteed not to @@ -158,9 +178,10 @@ typedef struct pm_message { * @suspend_noirq() has returned successfully. If the device can generate * system wakeup signals and is enabled to wake up the system, it should be * configured to do so at that time. However, depending on the platform - * and device's subsystem, @suspend() may be allowed to put the device into - * the low-power state and configure it to generate wakeup signals, in - * which case it generally is not necessary to define @suspend_noirq(). + * and device's subsystem, @suspend() or @suspend_late() may be allowed to + * put the device into the low-power state and configure it to generate + * wakeup signals, in which case it generally is not necessary to define + * @suspend_noirq(). * * @resume_noirq: Prepare for the execution of @resume() by carrying out any * operations required for resuming the device that might be racing with @@ -171,9 +192,9 @@ typedef struct pm_message { * additional operations required for freezing the device that might be * racing with its driver's interrupt handler, which is guaranteed not to * run while @freeze_noirq() is being executed. - * The power state of the device should not be changed by either @freeze() - * or @freeze_noirq() and it should not be configured to signal system - * wakeup by any of these callbacks. + * The power state of the device should not be changed by either @freeze(), + * or @freeze_late(), or @freeze_noirq() and it should not be configured to + * signal system wakeup by any of these callbacks. * * @thaw_noirq: Prepare for the execution of @thaw() by carrying out any * operations required for thawing the device that might be racing with its @@ -249,6 +270,12 @@ struct dev_pm_ops { int (*thaw)(struct device *dev); int (*poweroff)(struct device *dev); int (*restore)(struct device *dev); + int (*suspend_late)(struct device *dev); + int (*resume_early)(struct device *dev); + int (*freeze_late)(struct device *dev); + int (*thaw_early)(struct device *dev); + int (*poweroff_late)(struct device *dev); + int (*restore_early)(struct device *dev); int (*suspend_noirq)(struct device *dev); int (*resume_noirq)(struct device *dev); int (*freeze_noirq)(struct device *dev); @@ -584,13 +611,13 @@ struct dev_pm_domain { #ifdef CONFIG_PM_SLEEP extern void device_pm_lock(void); -extern void dpm_resume_noirq(pm_message_t state); +extern void dpm_resume_start(pm_message_t state); extern void dpm_resume_end(pm_message_t state); extern void dpm_resume(pm_message_t state); extern void dpm_complete(pm_message_t state); extern void device_pm_unlock(void); -extern int dpm_suspend_noirq(pm_message_t state); +extern int dpm_suspend_end(pm_message_t state); extern int dpm_suspend_start(pm_message_t state); extern int dpm_suspend(pm_message_t state); extern int dpm_prepare(pm_message_t state); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 91784a4f8608..ac1c114c499d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -42,8 +42,10 @@ enum suspend_stat_step { SUSPEND_FREEZE = 1, SUSPEND_PREPARE, SUSPEND_SUSPEND, + SUSPEND_SUSPEND_LATE, SUSPEND_SUSPEND_NOIRQ, SUSPEND_RESUME_NOIRQ, + SUSPEND_RESUME_EARLY, SUSPEND_RESUME }; @@ -53,8 +55,10 @@ struct suspend_stats { int failed_freeze; int failed_prepare; int failed_suspend; + int failed_suspend_late; int failed_suspend_noirq; int failed_resume; + int failed_resume_early; int failed_resume_noirq; #define REC_FAILED_NUM 2 int last_failed_dev; diff --git a/kernel/kexec.c b/kernel/kexec.c index 7b0886786701..a6a675cb9818 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1546,13 +1546,13 @@ int kernel_kexec(void) if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, - * but *not* dpm_suspend_noirq(). We *must* call - * dpm_suspend_noirq() now. Otherwise, drivers for + * but *not* dpm_suspend_end(). We *must* call + * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = disable_nonboot_cpus(); @@ -1579,7 +1579,7 @@ int kernel_kexec(void) local_irq_enable(); Enable_cpus: enable_nonboot_cpus(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 6d6d28870335..a5d4cf0aa03e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -245,8 +245,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, * create_image - Create a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image - * and execute the drivers' .thaw_noirq() callbacks. + * Execute device drivers' "late" and "noirq" freeze callbacks, create a + * hibernation image and run the drivers' "noirq" and "early" thaw callbacks. * * Control reappears in this routine after the subsequent restore. */ @@ -254,7 +254,7 @@ static int create_image(int platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_FREEZE); + error = dpm_suspend_end(PMSG_FREEZE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting hibernation\n"); @@ -306,7 +306,7 @@ static int create_image(int platform_mode) Platform_finish: platform_finish(platform_mode); - dpm_resume_noirq(in_suspend ? + dpm_resume_start(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); return error; @@ -394,16 +394,16 @@ int hibernation_snapshot(int platform_mode) * resume_target_kernel - Restore system state from a hibernation image. * @platform_mode: Whether or not to use the platform driver. * - * Execute device drivers' .freeze_noirq() callbacks, restore the contents of - * highmem that have not been restored yet from the image and run the low-level - * code that will restore the remaining contents of memory and switch to the - * just restored target kernel. + * Execute device drivers' "noirq" and "late" freeze callbacks, restore the + * contents of highmem that have not been restored yet from the image and run + * the low-level code that will restore the remaining contents of memory and + * switch to the just restored target kernel. */ static int resume_target_kernel(bool platform_mode) { int error; - error = dpm_suspend_noirq(PMSG_QUIESCE); + error = dpm_suspend_end(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -460,7 +460,7 @@ static int resume_target_kernel(bool platform_mode) Cleanup: platform_restore_cleanup(platform_mode); - dpm_resume_noirq(PMSG_RECOVER); + dpm_resume_start(PMSG_RECOVER); return error; } @@ -518,7 +518,7 @@ int hibernation_platform_enter(void) goto Resume_devices; } - error = dpm_suspend_noirq(PMSG_HIBERNATE); + error = dpm_suspend_end(PMSG_HIBERNATE); if (error) goto Resume_devices; @@ -549,7 +549,7 @@ int hibernation_platform_enter(void) Platform_finish: hibernation_ops->finish(); - dpm_resume_noirq(PMSG_RESTORE); + dpm_resume_start(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index 9824b41e5a18..8c5014a4e052 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -165,16 +165,20 @@ static int suspend_stats_show(struct seq_file *s, void *unused) last_errno %= REC_FAILED_NUM; last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; last_step %= REC_FAILED_NUM; - seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" - "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n%s: %d\n", "success", suspend_stats.success, "fail", suspend_stats.fail, "failed_freeze", suspend_stats.failed_freeze, "failed_prepare", suspend_stats.failed_prepare, "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_late", + suspend_stats.failed_suspend_late, "failed_suspend_noirq", suspend_stats.failed_suspend_noirq, "failed_resume", suspend_stats.failed_resume, + "failed_resume_early", + suspend_stats.failed_resume_early, "failed_resume_noirq", suspend_stats.failed_resume_noirq); seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4fd51beed879..560a639614a1 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -147,7 +147,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_finish; } - error = dpm_suspend_noirq(PMSG_SUSPEND); + error = dpm_suspend_end(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); goto Platform_finish; @@ -189,7 +189,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) if (suspend_ops->wake) suspend_ops->wake(); - dpm_resume_noirq(PMSG_RESUME); + dpm_resume_start(PMSG_RESUME); Platform_finish: if (suspend_ops->finish) -- cgit v1.2.3 From d031e1de2c5ba91e67ed83f6adf624543ab2b03d Mon Sep 17 00:00:00 2001 From: Alex Frid Date: Sun, 29 Jan 2012 20:39:25 +0100 Subject: PM / QoS: Simplify PM QoS expansion/merge - Replace class ID #define with enumeration - Loop through PM QoS objects during initialization (rather than initializing them one-by-one) Signed-off-by: Alex Frid Reviewed-by: Antti Miettinen Reviewed-by: Diwakar Tundlam Reviewed-by: Scott Williams Reviewed-by: Yu-Huan Hsu Acked-by: markgross Signed-off-by: Rafael J. Wysocki --- include/linux/pm_qos.h | 14 +++++++++----- kernel/power/qos.c | 23 ++++++++++------------- 2 files changed, 19 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index e5bbcbaa6f57..5ac91d8e69de 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -9,12 +9,16 @@ #include #include -#define PM_QOS_RESERVED 0 -#define PM_QOS_CPU_DMA_LATENCY 1 -#define PM_QOS_NETWORK_LATENCY 2 -#define PM_QOS_NETWORK_THROUGHPUT 3 +enum { + PM_QOS_RESERVED = 0, + PM_QOS_CPU_DMA_LATENCY, + PM_QOS_NETWORK_LATENCY, + PM_QOS_NETWORK_THROUGHPUT, + + /* insert new class ID */ + PM_QOS_NUM_CLASSES, +}; -#define PM_QOS_NUM_CLASSES 4 #define PM_QOS_DEFAULT_VALUE -1 #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 995e3bd3417b..d6d6dbd1ecc0 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -469,21 +469,18 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, static int __init pm_qos_power_init(void) { int ret = 0; + int i; - ret = register_pm_qos_misc(&cpu_dma_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); - return ret; - } - ret = register_pm_qos_misc(&network_lat_pm_qos); - if (ret < 0) { - printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); - return ret; + BUILD_BUG_ON(ARRAY_SIZE(pm_qos_array) != PM_QOS_NUM_CLASSES); + + for (i = 1; i < PM_QOS_NUM_CLASSES; i++) { + ret = register_pm_qos_misc(pm_qos_array[i]); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: %s setup failed\n", + pm_qos_array[i]->name); + return ret; + } } - ret = register_pm_qos_misc(&network_throughput_pm_qos); - if (ret < 0) - printk(KERN_ERR - "pm_qos_param: network_throughput setup failed\n"); return ret; } -- cgit v1.2.3 From 61d1d219c4c0761059236a46867bc49943c4d29d Mon Sep 17 00:00:00 2001 From: Mandeep Singh Baines Date: Mon, 30 Jan 2012 12:51:56 -0800 Subject: cgroup: remove extra calls to find_existing_css_set In cgroup_attach_proc, we indirectly call find_existing_css_set 3 times. It is an expensive call so we want to call it a minimum of times. This patch only calls it once and stores the result so that it can be used later on when we call cgroup_task_migrate. This required modifying cgroup_task_migrate to take the new css_set (which we obtained from find_css_set) as a parameter. The nice side effect of this is that cgroup_task_migrate is now identical for cgroup_attach_task and cgroup_attach_proc. It also now returns a void since it can never fail. Changes in V5: * https://lkml.org/lkml/2012/1/20/344 (Tejun Heo) * Remove css_set_refs Changes in V4: * https://lkml.org/lkml/2011/12/22/421 (Li Zefan) * Avoid GFP_KERNEL (sleep) in rcu_read_lock by getting css_set in a separate loop not under an rcu_read_lock Changes in V3: * https://lkml.org/lkml/2011/12/22/13 (Li Zefan) * Fixed earlier bug by creating a seperate patch to remove tasklist_lock Changes in V2: * https://lkml.org/lkml/2011/12/20/372 (Tejun Heo) * Move find_css_set call into loop which creates the flex array * Author * Kill css_set_refs and use group_size instead * Fix an off-by-one error in counting css_set refs * Add a retval check in out_list_teardown Signed-off-by: Mandeep Singh Baines Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: containers@lists.linux-foundation.org Cc: cgroups@vger.kernel.org Cc: KAMEZAWA Hiroyuki Cc: Frederic Weisbecker Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul Menage --- kernel/cgroup.c | 140 +++++++++++--------------------------------------------- 1 file changed, 27 insertions(+), 113 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1626152dcc1e..43a224f167b5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1763,6 +1763,7 @@ EXPORT_SYMBOL_GPL(cgroup_path); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; + struct css_set *cg; }; struct cgroup_taskset { @@ -1843,11 +1844,10 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); * will already exist. If not set, this function might sleep, and can fail with * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. */ -static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, - struct task_struct *tsk, bool guarantee) +static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, struct css_set *newcg) { struct css_set *oldcg; - struct css_set *newcg; /* * We are synchronized through threadgroup_lock() against PF_EXITING @@ -1857,23 +1857,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, WARN_ON_ONCE(tsk->flags & PF_EXITING); oldcg = tsk->cgroups; - /* locate or allocate a new css_set for this task. */ - if (guarantee) { - /* we know the css_set we want already exists. */ - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - read_lock(&css_set_lock); - newcg = find_existing_css_set(oldcg, cgrp, template); - BUG_ON(!newcg); - get_css_set(newcg); - read_unlock(&css_set_lock); - } else { - might_sleep(); - /* find_css_set will give us newcg already referenced. */ - newcg = find_css_set(oldcg, cgrp); - if (!newcg) - return -ENOMEM; - } - task_lock(tsk); rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1892,7 +1875,6 @@ static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); set_bit(CGRP_RELEASABLE, &oldcgrp->flags); - return 0; } /** @@ -1910,6 +1892,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroup *oldcgrp; struct cgroupfs_root *root = cgrp->root; struct cgroup_taskset tset = { }; + struct css_set *newcg; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1939,9 +1922,13 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } } - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); - if (retval) + newcg = find_css_set(tsk->cgroups, cgrp); + if (!newcg) { + retval = -ENOMEM; goto out; + } + + cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); for_each_subsys(root, ss) { if (ss->attach) @@ -1997,66 +1984,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -/* - * cgroup_attach_proc works in two stages, the first of which prefetches all - * new css_sets needed (to make sure we have enough memory before committing - * to the move) and stores them in a list of entries of the following type. - * TODO: possible optimization: use css_set->rcu_head for chaining instead - */ -struct cg_list_entry { - struct css_set *cg; - struct list_head links; -}; - -static bool css_set_check_fetched(struct cgroup *cgrp, - struct task_struct *tsk, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - - read_lock(&css_set_lock); - newcg = find_existing_css_set(cg, cgrp, template); - read_unlock(&css_set_lock); - - /* doesn't exist at all? */ - if (!newcg) - return false; - /* see if it's already in the list */ - list_for_each_entry(cg_entry, newcg_list, links) - if (cg_entry->cg == newcg) - return true; - - /* not found */ - return false; -} - -/* - * Find the new css_set and store it in the list in preparation for moving the - * given task to the given cgroup. Returns 0 or -ENOMEM. - */ -static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, - struct list_head *newcg_list) -{ - struct css_set *newcg; - struct cg_list_entry *cg_entry; - - /* ensure a new css_set will exist for this thread */ - newcg = find_css_set(cg, cgrp); - if (!newcg) - return -ENOMEM; - /* add it to the list */ - cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); - if (!cg_entry) { - put_css_set(newcg); - return -ENOMEM; - } - cg_entry->cg = newcg; - list_add(&cg_entry->links, newcg_list); - return 0; -} - /** * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup * @cgrp: the cgroup to attach to @@ -2070,20 +1997,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; /* guaranteed to be initialized later, but the compiler needs this */ - struct css_set *oldcg; struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ struct task_struct *tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; - /* - * we need to make sure we have css_sets for all the tasks we're - * going to move -before- we actually start moving them, so that in - * case we get an ENOMEM we can bail out before making any changes. - */ - struct list_head newcg_list; - struct cg_list_entry *cg_entry, *temp_nobe; /* * step 0: in order to do expensive, possibly blocking operations for @@ -2119,15 +2038,15 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); - /* - * saying GFP_ATOMIC has no effect here because we did prealloc - * earlier, but it's good form to communicate our expectations. - */ ent.task = tsk; ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) continue; + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; @@ -2160,17 +2079,12 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 2: make sure css_sets exist for all threads to be migrated. * we use find_css_set, which allocates a new one if necessary. */ - INIT_LIST_HEAD(&newcg_list); for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - oldcg = tc->task->cgroups; - - /* if we don't already have it in the list get a new one */ - if (!css_set_check_fetched(cgrp, tc->task, oldcg, - &newcg_list)) { - retval = css_set_prefetch(cgrp, oldcg, &newcg_list); - if (retval) - goto out_list_teardown; + tc->cg = find_css_set(tc->task->cgroups, cgrp); + if (!tc->cg) { + retval = -ENOMEM; + goto out_put_css_set_refs; } } @@ -2181,8 +2095,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); - BUG_ON(retval); + cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); } /* nothing is sensitive to fork() after this point. */ @@ -2200,15 +2113,16 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) synchronize_rcu(); cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; -out_list_teardown: - /* clean up the list of prefetched css_sets. */ - list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { - list_del(&cg_entry->links); - put_css_set(cg_entry->cg); - kfree(cg_entry); +out_put_css_set_refs: + if (retval) { + for (i = 0; i < group_size; i++) { + tc = flex_array_get(group, i); + if (!tc->cg) + break; + put_css_set(tc->cg); + } } out_cancel_attach: - /* same deal as in cgroup_attach_task */ if (retval) { for_each_subsys(root, ss) { if (ss == failed_ss) -- cgit v1.2.3 From ed387b781ea6e14b78f449aa2ee4f270b60b01ac Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 31 Jan 2012 11:40:32 +0900 Subject: sched: Move SMP-only variable into the SMP section This also fixes the following compilation warning on !SMP: CC kernel/sched/fair.o kernel/sched/fair.c:218:36: warning: 'max_load_balance_interval' defined but not used [-Wunused-variable] Signed-off-by: Hiroshi Shimamoto Reviewed-by: Vincent Guittot Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F2754A0.9090306@ct.jp.nec.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e77a6bd597b..4ab60a24ea49 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -215,8 +215,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, const struct sched_class fair_sched_class; -static unsigned long __read_mostly max_load_balance_interval = HZ/10; - /************************************************************** * CFS operations on generic schedulable entities: */ @@ -3086,6 +3084,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp * Fair scheduling class load-balancing methods: */ +static unsigned long __read_mostly max_load_balance_interval = HZ/10; + /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. -- cgit v1.2.3 From fe9161db2e6053da21e4649d77bbefaf3030b11d Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 1 Feb 2012 22:16:36 +0100 Subject: PM / Hibernate: Thaw kernel threads in SNAPSHOT_CREATE_IMAGE ioctl path In the SNAPSHOT_CREATE_IMAGE ioctl, if the call to hibernation_snapshot() fails, the frozen tasks are not thawed. And in the case of success, if we happen to exit due to a successful freezer test, all tasks (including those of userspace) are thawed, whereas actually we should have thawed only the kernel threads at that point. Fix both these issues. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/power/user.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index e5a21a857302..3e100075b13c 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,13 +249,15 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (!error) { + if (error) { + thaw_kernel_threads(); + } else { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; if (freezer_test_done) { freezer_test_done = false; - thaw_processes(); + thaw_kernel_threads(); } } break; -- cgit v1.2.3 From 761b3ef50e1c2649cffbfa67a4dcb2dcdb7982ed Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 31 Jan 2012 13:47:36 +0800 Subject: cgroup: remove cgroup_subsys argument from callbacks The argument is not used at all, and it's not necessary, because a specific callback handler of course knows which subsys it belongs to. Now only ->pupulate() takes this argument, because the handlers of this callback always call cgroup_add_file()/cgroup_add_files(). So we reduce a few lines of code, though the shrinking of object size is minimal. 16 files changed, 113 insertions(+), 162 deletions(-) text data bss dec hex filename 5486240 656987 7039960 13183187 c928d3 vmlinux.o.orig 5486170 656987 7039960 13183117 c9288d vmlinux.o Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- Documentation/cgroups/cgroups.txt | 26 +++++++++------------ block/blk-cgroup.c | 22 +++++++----------- include/linux/cgroup.h | 29 ++++++++++------------- include/net/sock.h | 7 +++--- include/net/tcp_memcontrol.h | 2 +- kernel/cgroup.c | 43 +++++++++++++++++------------------ kernel/cgroup_freezer.c | 11 ++++----- kernel/cpuset.c | 16 ++++--------- kernel/events/core.c | 13 ++++------- kernel/sched/core.c | 20 +++++++--------- mm/memcontrol.c | 48 ++++++++++++++++----------------------- net/core/netprio_cgroup.c | 10 ++++---- net/core/sock.c | 6 ++--- net/ipv4/tcp_memcontrol.c | 2 +- net/sched/cls_cgroup.c | 10 ++++---- security/device_cgroup.c | 10 ++++---- 16 files changed, 113 insertions(+), 162 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index a7c96ae5557c..8e74980ab385 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -558,8 +558,7 @@ Each subsystem may export the following methods. The only mandatory methods are create/destroy. Any others that are null are presumed to be successful no-ops. -struct cgroup_subsys_state *create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +struct cgroup_subsys_state *create(struct cgroup *cgrp) (cgroup_mutex held by caller) Called to create a subsystem state object for a cgroup. The @@ -574,7 +573,7 @@ identified by the passed cgroup object having a NULL parent (since it's the root of the hierarchy) and may be an appropriate place for initialization code. -void destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +void destroy(struct cgroup *cgrp) (cgroup_mutex held by caller) The cgroup system is about to destroy the passed cgroup; the subsystem @@ -585,7 +584,7 @@ cgroup->parent is still valid. (Note - can also be called for a newly-created cgroup if an error occurs after this subsystem's create() method has been called for the new cgroup). -int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +int pre_destroy(struct cgroup *cgrp); Called before checking the reference count on each subsystem. This may be useful for subsystems which have some extra references even if @@ -593,8 +592,7 @@ there are not tasks in the cgroup. If pre_destroy() returns error code, rmdir() will fail with it. From this behavior, pre_destroy() can be called multiple times against a cgroup. -int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +int can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called prior to moving one or more tasks into a cgroup; if the @@ -615,8 +613,7 @@ fork. If this method returns 0 (success) then this should remain valid while the caller holds cgroup_mutex and it is ensured that either attach() or cancel_attach() will be called in future. -void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called when a task attach operation has failed after can_attach() has succeeded. @@ -625,23 +622,22 @@ function, so that the subsystem can implement a rollback. If not, not necessary. This will be called only about subsystems whose can_attach() operation have succeeded. The parameters are identical to can_attach(). -void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +void attach(struct cgroup *cgrp, struct cgroup_taskset *tset) (cgroup_mutex held by caller) Called after the task has been attached to the cgroup, to allow any post-attachment activity that requires memory allocations or blocking. The parameters are identical to can_attach(). -void fork(struct cgroup_subsy *ss, struct task_struct *task) +void fork(struct task_struct *task) Called when a task is forked into a cgroup. -void exit(struct cgroup_subsys *ss, struct task_struct *task) +void exit(struct task_struct *task) Called during task exit. -int populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +int populate(struct cgroup *cgrp) (cgroup_mutex held by caller) Called after creation of a cgroup to allow a subsystem to populate @@ -651,7 +647,7 @@ include/linux/cgroup.h for details). Note that although this method can return an error code, the error code is currently not always handled well. -void post_clone(struct cgroup_subsys *ss, struct cgroup *cgrp) +void post_clone(struct cgroup *cgrp) (cgroup_mutex held by caller) Called during cgroup_create() to do any parameter @@ -659,7 +655,7 @@ initialization which might be required before a task could attach. For example in cpusets, no task may attach before 'cpus' and 'mems' are set up. -void bind(struct cgroup_subsys *ss, struct cgroup *root) +void bind(struct cgroup *root) (cgroup_mutex and ss->hierarchy_mutex held by caller) Called when a cgroup subsystem is rebound to a different hierarchy diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f26309444..1359d637831f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -28,13 +28,10 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); -static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, - struct cgroup *); -static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup_taskset *); -static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, - struct cgroup_taskset *); -static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); +static struct cgroup_subsys_state *blkiocg_create(struct cgroup *); +static int blkiocg_can_attach(struct cgroup *, struct cgroup_taskset *); +static void blkiocg_attach(struct cgroup *, struct cgroup_taskset *); +static void blkiocg_destroy(struct cgroup *); static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); /* for encoding cft->private value on file */ @@ -1548,7 +1545,7 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) ARRAY_SIZE(blkio_files)); } -static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +static void blkiocg_destroy(struct cgroup *cgroup) { struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); unsigned long flags; @@ -1598,8 +1595,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) kfree(blkcg); } -static struct cgroup_subsys_state * -blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +static struct cgroup_subsys_state *blkiocg_create(struct cgroup *cgroup) { struct blkio_cgroup *blkcg; struct cgroup *parent = cgroup->parent; @@ -1628,8 +1624,7 @@ done: * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int blkiocg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; @@ -1648,8 +1643,7 @@ static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return ret; } -static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void blkiocg_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7da3e745b74c..501adb1b2f43 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -452,23 +452,18 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); */ struct cgroup_subsys { - struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, - struct cgroup *cgrp); - int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); - int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset); - void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); - void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task); - int (*populate)(struct cgroup_subsys *ss, - struct cgroup *cgrp); - void (*post_clone)(struct cgroup_subsys *ss, struct cgroup *cgrp); - void (*bind)(struct cgroup_subsys *ss, struct cgroup *root); + struct cgroup_subsys_state *(*create)(struct cgroup *cgrp); + int (*pre_destroy)(struct cgroup *cgrp); + void (*destroy)(struct cgroup *cgrp); + int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + void (*fork)(struct task_struct *task); + void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task); + int (*populate)(struct cgroup_subsys *ss, struct cgroup *cgrp); + void (*post_clone)(struct cgroup *cgrp); + void (*bind)(struct cgroup *root); int subsys_id; int active; diff --git a/include/net/sock.h b/include/net/sock.h index bb972d254dff..705d1add19a1 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -68,7 +68,7 @@ struct cgroup; struct cgroup_subsys; #ifdef CONFIG_NET int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss); -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss); +void mem_cgroup_sockets_destroy(struct cgroup *cgrp); #else static inline int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) @@ -76,7 +76,7 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) return 0; } static inline -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +void mem_cgroup_sockets_destroy(struct cgroup *cgrp) { } #endif @@ -869,8 +869,7 @@ struct proto { */ int (*init_cgroup)(struct cgroup *cgrp, struct cgroup_subsys *ss); - void (*destroy_cgroup)(struct cgroup *cgrp, - struct cgroup_subsys *ss); + void (*destroy_cgroup)(struct cgroup *cgrp); struct cg_proto *(*proto_cgroup)(struct mem_cgroup *memcg); #endif }; diff --git a/include/net/tcp_memcontrol.h b/include/net/tcp_memcontrol.h index 3512082fa909..48410ff25c9e 100644 --- a/include/net/tcp_memcontrol.h +++ b/include/net/tcp_memcontrol.h @@ -13,7 +13,7 @@ struct tcp_memcontrol { struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg); int tcp_init_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); -void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss); +void tcp_destroy_cgroup(struct cgroup *cgrp); unsigned long long tcp_max_memory(const struct mem_cgroup *memcg); void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx); #endif /* _TCP_MEMCG_H */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 43a224f167b5..865d89a580c7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -818,7 +818,7 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { - ret = ss->pre_destroy(ss, cgrp); + ret = ss->pre_destroy(cgrp); if (ret) break; } @@ -846,7 +846,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1015,7 +1015,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(ss, cgrp); + ss->bind(cgrp); mutex_unlock(&ss->hierarchy_mutex); /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { @@ -1025,7 +1025,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); if (ss->bind) - ss->bind(ss, dummytop); + ss->bind(dummytop); dummytop->subsys[i]->cgroup = dummytop; cgrp->subsys[i] = NULL; subsys[i]->root = &rootnode; @@ -1908,7 +1908,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1932,7 +1932,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } synchronize_rcu(); @@ -1954,7 +1954,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } return retval; @@ -2067,7 +2067,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, &tset); + retval = ss->can_attach(cgrp, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2104,7 +2104,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) */ for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, &tset); + ss->attach(cgrp, &tset); } /* @@ -2128,7 +2128,7 @@ out_cancel_attach: if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, &tset); + ss->cancel_attach(cgrp, &tset); } } out_free_group_list: @@ -3756,7 +3756,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { - struct cgroup_subsys_state *css = ss->create(ss, cgrp); + struct cgroup_subsys_state *css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); @@ -3770,7 +3770,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) - ss->post_clone(ss, cgrp); + ss->post_clone(cgrp); } cgroup_lock_hierarchy(root); @@ -3804,7 +3804,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(ss, cgrp); + ss->destroy(cgrp); } mutex_unlock(&cgroup_mutex); @@ -4028,7 +4028,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(ss, dummytop); + css = ss->create(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4117,7 +4117,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * no ss->create seems to need anything important in the ss struct, so * this can happen first (i.e. before the rootnode attachment). */ - css = ss->create(ss, dummytop); + css = ss->create(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[i] = NULL; @@ -4135,7 +4135,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) int ret = cgroup_init_idr(ss, css); if (ret) { dummytop->subsys[ss->subsys_id] = NULL; - ss->destroy(ss, dummytop); + ss->destroy(dummytop); subsys[i] = NULL; mutex_unlock(&cgroup_mutex); return ret; @@ -4233,7 +4233,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * pointer to find their state. note that this also takes care of * freeing the css_id. */ - ss->destroy(ss, dummytop); + ss->destroy(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4509,7 +4509,7 @@ void cgroup_fork_callbacks(struct task_struct *child) for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) - ss->fork(ss, child); + ss->fork(child); } } } @@ -4611,7 +4611,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; struct cgroup *cgrp = task_cgroup(tsk, i); - ss->exit(ss, cgrp, old_cgrp, tsk); + ss->exit(cgrp, old_cgrp, tsk); } } } @@ -5066,8 +5066,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *debug_create(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5077,7 +5076,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, return css; } -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void debug_destroy(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fc0646b78a64..f86e93920b62 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -128,8 +128,7 @@ struct cgroup_subsys freezer_subsys; * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) * sighand->siglock */ -static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -142,8 +141,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, return &freezer->css; } -static void freezer_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -164,8 +162,7 @@ static bool is_task_frozen_enough(struct task_struct *task) * a write to that file racing against an attach, and hence the * can_attach() result will remain valid until the attach completes. */ -static int freezer_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, +static int freezer_can_attach(struct cgroup *new_cgroup, struct cgroup_taskset *tset) { struct freezer *freezer; @@ -185,7 +182,7 @@ static int freezer_can_attach(struct cgroup_subsys *ss, return 0; } -static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..5d575836dba6 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1399,8 +1399,7 @@ static nodemask_t cpuset_attach_nodemask_from; static nodemask_t cpuset_attach_nodemask_to; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct cpuset *cs = cgroup_cs(cgrp); struct task_struct *task; @@ -1436,8 +1435,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct mm_struct *mm; struct task_struct *task; @@ -1833,8 +1831,7 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) * (and likewise for mems) to the new cgroup. Called with cgroup_mutex * held. */ -static void cpuset_post_clone(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void cpuset_post_clone(struct cgroup *cgroup) { struct cgroup *parent, *child; struct cpuset *cs, *parent_cs; @@ -1857,13 +1854,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss, /* * cpuset_create - create a cpuset - * ss: cpuset cgroup subsystem * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create( - struct cgroup_subsys *ss, - struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1902,7 +1896,7 @@ static struct cgroup_subsys_state *cpuset_create( * will call async_rebuild_sched_domains(). */ -static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +static void cpuset_destroy(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); diff --git a/kernel/events/core.c b/kernel/events/core.c index a8f4ac001a00..a5d1ee92b0d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6906,8 +6906,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create( - struct cgroup_subsys *ss, struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) { struct perf_cgroup *jc; @@ -6924,8 +6923,7 @@ static struct cgroup_subsys_state *perf_cgroup_create( return &jc->css; } -static void perf_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void perf_cgroup_destroy(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -6941,8 +6939,7 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -6950,8 +6947,8 @@ static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index df00cb09263e..ff12f7216062 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7530,8 +7530,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state * -cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7548,15 +7547,14 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) return &tg->css; } -static void -cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpu_cgroup_destroy(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); sched_destroy_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7574,7 +7572,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *task; @@ -7584,8 +7582,8 @@ static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } static void -cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task) +cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. @@ -7935,8 +7933,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { */ /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create( - struct cgroup_subsys *ss, struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) { struct cpuacct *ca; @@ -7966,8 +7963,7 @@ out: } /* destroy an existing cpu accounting group */ -static void -cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cpuacct_destroy(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dbff4dcde35..ae2f0a8ab761 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4580,10 +4580,9 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(cont, ss); }; -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { - mem_cgroup_sockets_destroy(cont, ss); + mem_cgroup_sockets_destroy(cont); } #else static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) @@ -4591,8 +4590,7 @@ static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void kmem_cgroup_destroy(struct cgroup *cont) { } #endif @@ -4884,7 +4882,7 @@ err_cleanup: } static struct cgroup_subsys_state * __ref -mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) +mem_cgroup_create(struct cgroup *cont) { struct mem_cgroup *memcg, *parent; long error = -ENOMEM; @@ -4946,20 +4944,18 @@ free_out: return ERR_PTR(error); } -static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static int mem_cgroup_pre_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); return mem_cgroup_force_empty(memcg, false); } -static void mem_cgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cont) +static void mem_cgroup_destroy(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(ss, cont); + kmem_cgroup_destroy(cont); mem_cgroup_put(memcg); } @@ -5296,9 +5292,8 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; @@ -5336,9 +5331,8 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); } @@ -5453,9 +5447,8 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); struct mm_struct *mm = get_task_mm(p); @@ -5470,20 +5463,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static int mem_cgroup_can_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, - struct cgroup *cgroup, - struct cgroup_taskset *tset) +static void mem_cgroup_cancel_attach(struct cgroup *cgroup, + struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup_subsys *ss, - struct cgroup *cont, - struct cgroup_taskset *tset) +static void mem_cgroup_move_task(struct cgroup *cont, + struct cgroup_taskset *tset) { } #endif diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3a9fd4826b75..22036ab732cf 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -23,9 +23,8 @@ #include #include -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup *cgrp); static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); struct cgroup_subsys net_prio_subsys = { @@ -120,8 +119,7 @@ static void update_netdev_tables(void) rtnl_unlock(); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; int ret; @@ -145,7 +143,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, return &cs->css; } -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cgrp_destroy(struct cgroup *cgrp) { struct cgroup_netprio_state *cs; struct net_device *dev; diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af9988f94..688037cb3b6e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -160,19 +160,19 @@ int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss) out: list_for_each_entry_continue_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) - proto->destroy_cgroup(cgrp, ss); + proto->destroy_cgroup(cgrp); mutex_unlock(&proto_list_mutex); return ret; } -void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) +void mem_cgroup_sockets_destroy(struct cgroup *cgrp) { struct proto *proto; mutex_lock(&proto_list_mutex); list_for_each_entry_reverse(proto, &proto_list, node) if (proto->destroy_cgroup) - proto->destroy_cgroup(cgrp, ss); + proto->destroy_cgroup(cgrp); mutex_unlock(&proto_list_mutex); } #endif diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..e714c6834c90 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -94,7 +94,7 @@ create_files: } EXPORT_SYMBOL(tcp_init_cgroup); -void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) +void tcp_destroy_cgroup(struct cgroup *cgrp) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct cg_proto *cg_proto; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index f84fdc3a7f27..1afaa284fcd7 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -22,9 +22,8 @@ #include #include -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp); -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp); +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp); +static void cgrp_destroy(struct cgroup *cgrp); static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp); struct cgroup_subsys net_cls_subsys = { @@ -51,8 +50,7 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) struct cgroup_cls_state, css); } -static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, - struct cgroup *cgrp) +static struct cgroup_subsys_state *cgrp_create(struct cgroup *cgrp) { struct cgroup_cls_state *cs; @@ -66,7 +64,7 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss, return &cs->css; } -static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void cgrp_destroy(struct cgroup *cgrp) { kfree(cgrp_cls_state(cgrp)); } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 8b5b5d8612c6..c43a3323feea 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -61,8 +61,8 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; -static int devcgroup_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgrp, struct cgroup_taskset *set) +static int devcgroup_can_attach(struct cgroup *new_cgrp, + struct cgroup_taskset *set) { struct task_struct *task = cgroup_taskset_first(set); @@ -156,8 +156,7 @@ remove: /* * called from kernel/cgroup.c with cgroup_lock() held. */ -static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup) { struct dev_cgroup *dev_cgroup, *parent_dev_cgroup; struct cgroup *parent_cgroup; @@ -195,8 +194,7 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup_subsys *ss, return &dev_cgroup->css; } -static void devcgroup_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) +static void devcgroup_destroy(struct cgroup *cgroup) { struct dev_cgroup *dev_cgroup; struct dev_whitelist_item *wh, *tmp; -- cgit v1.2.3 From a80b83b7b8456e9b475346c2e01d7e210883208c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 3 Feb 2012 00:19:07 -0800 Subject: Input: add infrastructure for selecting clockid for event time stamps As noted by Arve and others, since wall time can jump backwards, it is difficult to use for input because one cannot determine if one event occurred before another or for how long a key was pressed. However, the timestamp field is part of the kernel ABI, and cannot be changed without possibly breaking existing users. This patch adds a new IOCTL that allows a clockid to be set in the evdev_client struct that will specify which time base to use for event timestamps (ie: CLOCK_MONOTONIC instead of CLOCK_REALTIME). For now we only support CLOCK_MONOTONIC and CLOCK_REALTIME, but in the future we could support other clockids if appropriate. The default remains CLOCK_REALTIME, so we don't change the ABI. Signed-off-by: John Stultz Reviewed-by: Daniel Kurtz Signed-off-by: Dmitry Torokhov --- drivers/input/evdev.c | 25 +++++++++++++++++++++---- include/linux/input.h | 2 ++ kernel/time/timekeeping.c | 2 ++ 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index 76457d50bc34..c17409742999 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -46,6 +46,7 @@ struct evdev_client { struct fasync_struct *fasync; struct evdev *evdev; struct list_head node; + int clkid; unsigned int bufsize; struct input_event buffer[]; }; @@ -54,8 +55,12 @@ static struct evdev *evdev_table[EVDEV_MINORS]; static DEFINE_MUTEX(evdev_table_mutex); static void evdev_pass_event(struct evdev_client *client, - struct input_event *event) + struct input_event *event, + ktime_t mono, ktime_t real) { + event->time = ktime_to_timeval(client->clkid == CLOCK_MONOTONIC ? + mono : real); + /* Interrupts are disabled, just acquire the lock. */ spin_lock(&client->buffer_lock); @@ -94,8 +99,11 @@ static void evdev_event(struct input_handle *handle, struct evdev *evdev = handle->private; struct evdev_client *client; struct input_event event; + ktime_t time_mono, time_real; + + time_mono = ktime_get(); + time_real = ktime_sub(time_mono, ktime_get_monotonic_offset()); - do_gettimeofday(&event.time); event.type = type; event.code = code; event.value = value; @@ -103,11 +111,12 @@ static void evdev_event(struct input_handle *handle, rcu_read_lock(); client = rcu_dereference(evdev->grab); + if (client) - evdev_pass_event(client, &event); + evdev_pass_event(client, &event, time_mono, time_real); else list_for_each_entry_rcu(client, &evdev->client_list, node) - evdev_pass_event(client, &event); + evdev_pass_event(client, &event, time_mono, time_real); rcu_read_unlock(); @@ -685,6 +694,14 @@ static long evdev_do_ioctl(struct file *file, unsigned int cmd, else return evdev_ungrab(evdev, client); + case EVIOCSCLOCKID: + if (copy_from_user(&i, p, sizeof(unsigned int))) + return -EFAULT; + if (i != CLOCK_MONOTONIC && i != CLOCK_REALTIME) + return -EINVAL; + client->clkid = i; + return 0; + case EVIOCGKEYCODE: return evdev_handle_get_keycode(dev, p); diff --git a/include/linux/input.h b/include/linux/input.h index 3862e32c4eeb..177261ea6f52 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -129,6 +129,8 @@ struct input_keymap_entry { #define EVIOCGRAB _IOW('E', 0x90, int) /* Grab/Release device */ +#define EVIOCSCLOCKID _IOW('E', 0xa0, int) /* Set clockid to be used for timestamps */ + /* * Device properties and quirks */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2b021b0e8507..169479994755 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1140,6 +1140,8 @@ ktime_t ktime_get_monotonic_offset(void) } while (read_seqretry(&xtime_lock, seq)); return timespec_to_ktime(wtom); } +EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); + /** * xtime_update() - advances the timekeeping infrastructure -- cgit v1.2.3 From ac483c446b67870444c9eeaf8325d3d2af9b91bc Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 2 Jan 2012 10:04:14 +0100 Subject: ftrace: Change filter/notrace set functions to return exit code Currently the ftrace_set_filter and ftrace_set_notrace functions do not return any return code. So there's no way for ftrace_ops user to tell wether the filter was correctly applied. The set_ftrace_filter interface returns error in case the filter did not match: # echo krava > set_ftrace_filter bash: echo: write error: Invalid argument Changing both ftrace_set_filter and ftrace_set_notrace functions to return zero if the filter was applied correctly or -E* values in case of error. Link: http://lkml.kernel.org/r/1325495060-6402-2-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 ++-- kernel/trace/ftrace.c | 15 +++++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 028e26f0bf08..f33fb3b041c6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -178,9 +178,9 @@ struct dyn_ftrace { }; int ftrace_force_update(void); -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 683d559a0eef..e2e0597c0845 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3146,8 +3146,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_regex_lock); if (reset) ftrace_filter_reset(hash); - if (buf) - ftrace_match_records(hash, buf, len); + if (buf && !ftrace_match_records(hash, buf, len)) { + ret = -EINVAL; + goto out_regex_unlock; + } mutex_lock(&ftrace_lock); ret = ftrace_hash_move(ops, enable, orig_hash, hash); @@ -3157,6 +3159,7 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_unlock(&ftrace_lock); + out_regex_unlock: mutex_unlock(&ftrace_regex_lock); free_ftrace_hash(hash); @@ -3173,10 +3176,10 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, * Filters denote which functions should be enabled when tracing is enabled. * If @buf is NULL and reset is set, all functions will be enabled for tracing. */ -void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 1); + return ftrace_set_regex(ops, buf, len, reset, 1); } EXPORT_SYMBOL_GPL(ftrace_set_filter); @@ -3191,10 +3194,10 @@ EXPORT_SYMBOL_GPL(ftrace_set_filter); * is enabled. If @buf is NULL and reset is set, all functions will be enabled * for tracing. */ -void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, +int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset) { - ftrace_set_regex(ops, buf, len, reset, 0); + return ftrace_set_regex(ops, buf, len, reset, 0); } EXPORT_SYMBOL_GPL(ftrace_set_notrace); /** -- cgit v1.2.3 From f069686e4bdc60a637d210ea3eea25fcdb82df88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Jan 2012 20:18:55 -0500 Subject: tracing/softirq: Move __raise_softirq_irqoff() out of header The __raise_softirq_irqoff() contains a tracepoint. As tracepoints in headers can cause issues, and not to mention, bloats the kernel when they are in a static inline, it is best to move the function that contains the tracepoint out of the header and into softirq.c. Link: http://lkml.kernel.org/r/20120118120711.GB14863@elte.hu Suggested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/linux/interrupt.h | 7 +------ kernel/irq/chip.c | 2 ++ kernel/softirq.c | 6 ++++++ 3 files changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a64b00e286f5..3f830e005118 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -20,7 +20,6 @@ #include #include #include -#include /* * These correspond to the IORESOURCE_IRQ_* defines in @@ -456,11 +455,7 @@ asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -static inline void __raise_softirq_irqoff(unsigned int nr) -{ - trace_softirq_raise(nr); - or_softirq_pending(1UL << nr); -} +extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..fc418249f01f 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -16,6 +16,8 @@ #include #include +#include + #include "internals.h" /** diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..06d40993594a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -385,6 +385,12 @@ void raise_softirq(unsigned int nr) local_irq_restore(flags); } +void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise(nr); + or_softirq_pending(1UL << nr); +} + void open_softirq(int nr, void (*action)(struct softirq_action *)) { softirq_vec[nr].action = action; -- cgit v1.2.3 From 55ca6140e9bb307efc97a9301a4f501de02a6fd6 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 3 Feb 2012 15:37:16 -0800 Subject: kprobes: fix a memory leak in function pre_handler_kretprobe() In function pre_handler_kretprobe(), the allocated kretprobe_instance object will get leaked if the entry_handler callback returns non-zero. This may cause all the preallocated kretprobe_instance objects exhausted. This issue can be reproduced by changing samples/kprobes/kretprobe_example.c to probe "mutex_unlock". And the fix is straightforward: just put the allocated kretprobe_instance object back onto the free_instances list. [akpm@linux-foundation.org: use raw_spin_lock/unlock] Signed-off-by: Jiang Liu Acked-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 29f5b65bee29..9788c0ec6f43 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1673,8 +1673,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, ri->rp = rp; ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + raw_spin_lock_irqsave(&rp->lock, flags); + hlist_add_head(&ri->hlist, &rp->free_instances); + raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; + } arch_prepare_kretprobe(ri, regs); -- cgit v1.2.3 From 379e0be812ab8a2a351e784b0c987788f5123090 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 3 Feb 2012 22:22:41 +0100 Subject: PM / Freezer: Thaw only kernel threads if freezing of kernel threads fails If freezing of kernel threads fails, we are expected to automatically thaw tasks in the error recovery path. However, at times, we encounter situations in which we would like the automatic error recovery path to thaw only the kernel threads, because we want to be able to do some more cleanup before we thaw userspace. Something like: error = freeze_kernel_threads(); if (error) { /* Do some cleanup */ /* Only then thaw userspace tasks*/ thaw_processes(); } An example of such a situation is where we freeze/thaw filesystems during suspend/hibernation. There, if freezing of kernel threads fails, we would like to thaw the frozen filesystems before thawing the userspace tasks. So, modify freeze_kernel_threads() to thaw only kernel threads in case of freezing failure. And change suspend_freeze_processes() accordingly. (At the same time, let us also get rid of the rather cryptic usage of the conditional operator (:?) in that function.) [rjw: In fact, this patch fixes a regression introduced during the 3.3 merge window, because without it thaw_processes() may be called before swsusp_free() in some situations and that may lead to massive memory allocation failures.] Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Acked-by: Nigel Cunningham Signed-off-by: Rafael J. Wysocki --- kernel/power/power.h | 24 ++++++++++++++++++++++-- kernel/power/process.c | 7 +++++-- 2 files changed, 27 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 0c4defe6d3b8..21724eee5206 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -231,8 +231,28 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - int error = freeze_processes(); - return error ? : freeze_kernel_threads(); + int error; + + error = freeze_processes(); + + /* + * freeze_processes() automatically thaws every task if freezing + * fails. So we need not do anything extra upon error. + */ + if (error) + goto Finish; + + error = freeze_kernel_threads(); + + /* + * freeze_kernel_threads() thaws only kernel threads upon freezing + * failure. So we have to thaw the userspace tasks ourselves. + */ + if (error) + thaw_processes(); + + Finish: + return error; } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index eeca00311f39..7e426459e60a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -143,7 +143,10 @@ int freeze_processes(void) /** * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. * - * On success, returns 0. On failure, -errno and system is fully thawed. + * On success, returns 0. On failure, -errno and only the kernel threads are + * thawed, so as to give a chance to the caller to do additional cleanups + * (if any) before thawing the userspace tasks. So, it is the responsibility + * of the caller to thaw the userspace tasks, when the time is right. */ int freeze_kernel_threads(void) { @@ -159,7 +162,7 @@ int freeze_kernel_threads(void) BUG_ON(in_atomic()); if (error) - thaw_processes(); + thaw_kernel_threads(); return error; } -- cgit v1.2.3 From 11a3122f6cf2d988a77eb8883d0fc49cd013a6d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 7 Feb 2012 07:51:30 +0100 Subject: block: strip out locking optimization in put_io_context() put_io_context() performed a complex trylock dancing to avoid deferring ioc release to workqueue. It was also broken on UP because trylock was always assumed to succeed which resulted in unbalanced preemption count. While there are ways to fix the UP breakage, even the most pathological microbench (forced ioc allocation and tight fork/exit loop) fails to show any appreciable performance benefit of the optimization. Strip it out. If there turns out to be workloads which are affected by this change, simpler optimization from the discussion thread can be applied later. Signed-off-by: Tejun Heo LKML-Reference: <1328514611.21268.66.camel@sli10-conroe> Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- block/blk-core.c | 2 +- block/blk-ioc.c | 92 ++++++----------------------------------------- block/cfq-iosched.c | 2 +- fs/ioprio.c | 2 +- include/linux/blkdev.h | 3 -- include/linux/iocontext.h | 5 ++- kernel/fork.c | 2 +- 8 files changed, 18 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index fa8f26309444..75642a352a8f 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1659,7 +1659,7 @@ static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_cgroup_changed(ioc); - put_io_context(ioc, NULL); + put_io_context(ioc); } } } diff --git a/block/blk-core.c b/block/blk-core.c index 636702575118..532b3a21b383 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -642,7 +642,7 @@ static inline void blk_free_request(struct request_queue *q, struct request *rq) if (rq->cmd_flags & REQ_ELVPRIV) { elv_put_request(q, rq); if (rq->elv.icq) - put_io_context(rq->elv.icq->ioc, q); + put_io_context(rq->elv.icq->ioc); } mempool_free(rq, q->rq.rq_pool); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 7490b6da2453..9884fd7427fe 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -29,21 +29,6 @@ void get_io_context(struct io_context *ioc) } EXPORT_SYMBOL(get_io_context); -/* - * Releasing ioc may nest into another put_io_context() leading to nested - * fast path release. As the ioc's can't be the same, this is okay but - * makes lockdep whine. Keep track of nesting and use it as subclass. - */ -#ifdef CONFIG_LOCKDEP -#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0) -#define ioc_release_depth_inc(q) (q)->ioc_release_depth++ -#define ioc_release_depth_dec(q) (q)->ioc_release_depth-- -#else -#define ioc_release_depth(q) 0 -#define ioc_release_depth_inc(q) do { } while (0) -#define ioc_release_depth_dec(q) do { } while (0) -#endif - static void icq_free_icq_rcu(struct rcu_head *head) { struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); @@ -75,11 +60,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (rcu_dereference_raw(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); - if (et->ops.elevator_exit_icq_fn) { - ioc_release_depth_inc(q); + if (et->ops.elevator_exit_icq_fn) et->ops.elevator_exit_icq_fn(icq); - ioc_release_depth_dec(q); - } /* * @icq->q might have gone away by the time RCU callback runs @@ -149,81 +131,29 @@ static void ioc_release_fn(struct work_struct *work) /** * put_io_context - put a reference of io_context * @ioc: io_context to put - * @locked_q: request_queue the caller is holding queue_lock of (hint) * * Decrement reference count of @ioc and release it if the count reaches - * zero. If the caller is holding queue_lock of a queue, it can indicate - * that with @locked_q. This is an optimization hint and the caller is - * allowed to pass in %NULL even when it's holding a queue_lock. + * zero. */ -void put_io_context(struct io_context *ioc, struct request_queue *locked_q) +void put_io_context(struct io_context *ioc) { - struct request_queue *last_q = locked_q; unsigned long flags; if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - if (locked_q) - lockdep_assert_held(locked_q->queue_lock); - - if (!atomic_long_dec_and_test(&ioc->refcount)) - return; /* - * Destroy @ioc. This is a bit messy because icq's are chained - * from both ioc and queue, and ioc->lock nests inside queue_lock. - * The inner ioc->lock should be held to walk our icq_list and then - * for each icq the outer matching queue_lock should be grabbed. - * ie. We need to do reverse-order double lock dancing. - * - * Another twist is that we are often called with one of the - * matching queue_locks held as indicated by @locked_q, which - * prevents performing double-lock dance for other queues. - * - * So, we do it in two stages. The fast path uses the queue_lock - * the caller is holding and, if other queues need to be accessed, - * uses trylock to avoid introducing locking dependency. This can - * handle most cases, especially if @ioc was performing IO on only - * single device. - * - * If trylock doesn't cut it, we defer to @ioc->release_work which - * can do all the double-locking dancing. + * Releasing ioc requires reverse order double locking and we may + * already be holding a queue_lock. Do it asynchronously from wq. */ - spin_lock_irqsave_nested(&ioc->lock, flags, - ioc_release_depth(locked_q)); - - while (!hlist_empty(&ioc->icq_list)) { - struct io_cq *icq = hlist_entry(ioc->icq_list.first, - struct io_cq, ioc_node); - struct request_queue *this_q = icq->q; - - if (this_q != last_q) { - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - last_q = NULL; - - /* spin_trylock() always successes in UP case */ - if (this_q != locked_q && - !spin_trylock(this_q->queue_lock)) - break; - last_q = this_q; - continue; - } - ioc_exit_icq(icq); + if (atomic_long_dec_and_test(&ioc->refcount)) { + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) + schedule_work(&ioc->release_work); + spin_unlock_irqrestore(&ioc->lock, flags); } - - if (last_q && last_q != locked_q) - spin_unlock(last_q->queue_lock); - - spin_unlock_irqrestore(&ioc->lock, flags); - - /* if no icq is left, we're done; otherwise, kick release_work */ - if (hlist_empty(&ioc->icq_list)) - kmem_cache_free(iocontext_cachep, ioc); - else - schedule_work(&ioc->release_work); } EXPORT_SYMBOL(put_io_context); @@ -238,7 +168,7 @@ void exit_io_context(struct task_struct *task) task_unlock(task); atomic_dec(&ioc->nr_tasks); - put_io_context(ioc, NULL); + put_io_context(ioc); } /** diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index da21c24dbed3..5684df6848bc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1794,7 +1794,7 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqd->active_queue = NULL; if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->icq.ioc, cfqd->queue); + put_io_context(cfqd->active_cic->icq.ioc); cfqd->active_cic = NULL; } } diff --git a/fs/ioprio.c b/fs/ioprio.c index f84b380d65e5..0f1b9515213b 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -51,7 +51,7 @@ int set_task_ioprio(struct task_struct *task, int ioprio) ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); if (ioc) { ioc_ioprio_changed(ioc, ioprio); - put_io_context(ioc, NULL); + put_io_context(ioc); } return err; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6c6a1f008065..606cf339bb56 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -399,9 +399,6 @@ struct request_queue { /* Throttle data */ struct throtl_data *td; #endif -#ifdef CONFIG_LOCKDEP - int ioc_release_depth; -#endif }; #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 7e1371c4bccf..119773eebe31 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -133,7 +133,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) struct task_struct; #ifdef CONFIG_BLOCK -void put_io_context(struct io_context *ioc, struct request_queue *locked_q); +void put_io_context(struct io_context *ioc); void exit_io_context(struct task_struct *task); struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node); @@ -141,8 +141,7 @@ void ioc_ioprio_changed(struct io_context *ioc, int ioprio); void ioc_cgroup_changed(struct io_context *ioc); #else struct io_context; -static inline void put_io_context(struct io_context *ioc, - struct request_queue *locked_q) { } +static inline void put_io_context(struct io_context *ioc) { } static inline void exit_io_context(struct task_struct *task) { } #endif diff --git a/kernel/fork.c b/kernel/fork.c index 051f090d40c1..c574aefa8d1b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -890,7 +890,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; new_ioc->ioprio = ioc->ioprio; - put_io_context(new_ioc, NULL); + put_io_context(new_ioc); } #endif return 0; -- cgit v1.2.3 From f39d47ff819ed52a2afbdbecbe35f23f7755f58d Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 7 Feb 2012 14:39:57 +0100 Subject: perf: Fix double start/stop in x86_pmu_start() The following patch fixes a bug introduced by the following commit: e050e3f0a71b ("perf: Fix broken interrupt rate throttling") The patch caused the following warning to pop up depending on the sampling frequency adjustments: ------------[ cut here ]------------ WARNING: at arch/x86/kernel/cpu/perf_event.c:995 x86_pmu_start+0x79/0xd4() It was caused by the following call sequence: perf_adjust_freq_unthr_context.part() { stop() if (delta > 0) { perf_adjust_period() { if (period > 8*...) { stop() ... start() } } } start() } Which caused a double start and a double stop, thus triggering the assert in x86_pmu_start(). The patch fixes the problem by avoiding the double calls. We pass a new argument to perf_adjust_period() to indicate whether or not the event is already stopped. We can't just remove the start/stop from that function because it's called from __perf_event_overflow where the event needs to be reloaded via a stop/start back-toback call. The patch reintroduces the assertion in x86_pmu_start() which was removed by commit: 84f2b9b ("perf: Remove deprecated WARN_ON_ONCE()") In this second version, we've added calls to disable/enable PMU during unthrottling or frequency adjustment based on bug report of spurious NMI interrupts from Eric Dumazet. Reported-and-tested-by: Eric Dumazet Signed-off-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: markus@trippelsdorf.de Cc: paulus@samba.org Link: http://lkml.kernel.org/r/20120207133956.GA4932@quad [ Minor edits to the changelog and to the code ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ kernel/events/core.c | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2a30e5ae6acf..5adce1040b11 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -986,6 +986,9 @@ static void x86_pmu_start(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx = event->hw.idx; + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + if (WARN_ON_ONCE(idx == -1)) return; diff --git a/kernel/events/core.c b/kernel/events/core.c index ba36013cfb21..1b5c081d8b9f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2303,7 +2303,7 @@ do { \ static DEFINE_PER_CPU(int, perf_throttled_count); static DEFINE_PER_CPU(u64, perf_throttled_seq); -static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) +static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable) { struct hw_perf_event *hwc = &event->hw; s64 period, sample_period; @@ -2322,9 +2322,13 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - event->pmu->stop(event, PERF_EF_UPDATE); + if (disable) + event->pmu->stop(event, PERF_EF_UPDATE); + local64_set(&hwc->period_left, 0); - event->pmu->start(event, PERF_EF_RELOAD); + + if (disable) + event->pmu->start(event, PERF_EF_RELOAD); } } @@ -2350,6 +2354,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, return; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { if (event->state != PERF_EVENT_STATE_ACTIVE) @@ -2381,13 +2386,17 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, /* * restart the event * reload only if value has changed + * we have stopped the event so tell that + * to perf_adjust_period() to avoid stopping it + * twice. */ if (delta > 0) - perf_adjust_period(event, period, delta); + perf_adjust_period(event, period, delta, false); event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0); } + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -4562,7 +4571,7 @@ static int __perf_event_overflow(struct perf_event *event, hwc->freq_time_stamp = now; if (delta > 0 && delta < 2*TICK_NSEC) - perf_adjust_period(event, delta, hwc->last_period); + perf_adjust_period(event, delta, hwc->last_period, true); } /* -- cgit v1.2.3 From 1a2a4d06e1e95260c470ebe3a945f61bbe8c1fd8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 21 Dec 2011 12:17:03 -0800 Subject: security: create task_free security callback The current LSM interface to cred_free is not sufficient for allowing an LSM to track the life and death of a task. This patch adds the task_free hook so that an LSM can clean up resources on task death. Signed-off-by: Kees Cook Signed-off-by: James Morris --- include/linux/security.h | 9 +++++++++ kernel/fork.c | 1 + security/capability.c | 5 +++++ security/security.c | 5 +++++ 4 files changed, 20 insertions(+) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 83c18e8c846d..8325eddd9ee4 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -651,6 +651,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. + * @task_free: + * @task task being freed + * Handle release of task-related resources. (Note that this can be called + * from interrupt context.) * @cred_alloc_blank: * @cred points to the credentials. * @gfp indicates the atomicity of any memory allocations. @@ -1493,6 +1497,7 @@ struct security_operations { int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); + void (*task_free) (struct task_struct *task); int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp); void (*cred_free) (struct cred *cred); int (*cred_prepare)(struct cred *new, const struct cred *old, @@ -1752,6 +1757,7 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); +void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); @@ -2245,6 +2251,9 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } +static inline void security_task_free(struct task_struct *task) +{ } + static inline int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 1b2ef3c23ae4..f0e7781ba9b4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -192,6 +192,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); diff --git a/security/capability.c b/security/capability.c index 2f680eb02b59..5bb21b1c448c 100644 --- a/security/capability.c +++ b/security/capability.c @@ -358,6 +358,10 @@ static int cap_task_create(unsigned long clone_flags) return 0; } +static void cap_task_free(struct task_struct *task) +{ +} + static int cap_cred_alloc_blank(struct cred *cred, gfp_t gfp) { return 0; @@ -954,6 +958,7 @@ void __init security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); + set_to_cap_if_null(ops, task_free); set_to_cap_if_null(ops, cred_alloc_blank); set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, cred_prepare); diff --git a/security/security.c b/security/security.c index d7542493454d..7d9426bb7442 100644 --- a/security/security.c +++ b/security/security.c @@ -729,6 +729,11 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } +void security_task_free(struct task_struct *task) +{ + security_ops->task_free(task); +} + int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) { return security_ops->cred_alloc_blank(cred, gfp); -- cgit v1.2.3 From 8916e3702ec422b57cc549fbae3986106292100f Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Sat, 4 Feb 2012 22:26:13 +0100 Subject: PM / Suspend: Avoid code duplication in suspend statistics update The code if (error) { suspend_stats.fail++; dpm_save_failed_errno(error); } else suspend_stats.success++; Appears in the kernel/power/main.c and kernel/power/suspend.c. This patch just creates a new function to avoid duplicated code. Suggested-by: Srivatsa S. Bhat Signed-off-by: Marcos Paulo de Souza Acked-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- include/linux/suspend.h | 16 ++++++++++++++++ kernel/power/main.c | 6 +----- kernel/power/suspend.c | 6 +----- 3 files changed, 18 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ac1c114c499d..b90191894441 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -94,6 +94,22 @@ static inline void dpm_save_failed_step(enum suspend_stat_step step) suspend_stats.last_failed_step %= REC_FAILED_NUM; } +/** + * suspend_stats_update - Update success/failure statistics of suspend-to-ram + * + * @error: Value returned by enter_state() function + */ +static inline void suspend_stats_update(int error) +{ + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; + } +} + + /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. diff --git a/kernel/power/main.c b/kernel/power/main.c index 8c5014a4e052..b1e324878d5f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -296,11 +296,7 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } if (state < PM_SUSPEND_MAX && *s) { error = enter_state(state); - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else - suspend_stats.success++; + suspend_stats_update(error); } #endif diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 560a639614a1..03bc92b42750 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -321,11 +321,7 @@ int pm_suspend(suspend_state_t state) int ret; if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { ret = enter_state(state); - if (ret) { - suspend_stats.fail++; - dpm_save_failed_errno(ret); - } else - suspend_stats.success++; + suspend_stats_update(ret); return ret; } return -EINVAL; -- cgit v1.2.3 From 51d6ff7acd920379f54d0be4dbe844a46178a65f Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 4 Feb 2012 22:26:38 +0100 Subject: PM / Hibernate: Thaw kernel threads in hibernation_snapshot() in error/test path In the hibernation call path, the kernel threads are frozen inside hibernation_snapshot(). If we happen to encounter an error further down the road or if we are exiting early due to a successful freezer test, then thaw kernel threads before returning to the caller. Signed-off-by: Srivatsa S. Bhat Acked-by: Tejun Heo Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 6 ++++-- kernel/power/user.c | 8 ++------ 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index a5d4cf0aa03e..c6dee739080c 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -343,13 +343,13 @@ int hibernation_snapshot(int platform_mode) * successful freezer test. */ freezer_test_done = true; - goto Cleanup; + goto Thaw; } error = dpm_prepare(PMSG_FREEZE); if (error) { dpm_complete(PMSG_RECOVER); - goto Cleanup; + goto Thaw; } suspend_console(); @@ -385,6 +385,8 @@ int hibernation_snapshot(int platform_mode) platform_end(platform_mode); return error; + Thaw: + thaw_kernel_threads(); Cleanup: swsusp_free(); goto Close; diff --git a/kernel/power/user.c b/kernel/power/user.c index 3e100075b13c..7bee91f9af51 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -249,16 +249,12 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); - if (error) { - thaw_kernel_threads(); - } else { + if (!error) { error = put_user(in_suspend, (int __user *)arg); if (!error && !freezer_test_done) data->ready = 1; - if (freezer_test_done) { + if (freezer_test_done) freezer_test_done = false; - thaw_kernel_threads(); - } } break; -- cgit v1.2.3 From a556d5b58345ccf51826b9ceac078072f830738b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Sat, 4 Feb 2012 23:39:56 +0100 Subject: PM / Hibernate: Refactor and simplify freezer_test_done The code related to 'freezer_test_done' is needlessly convoluted. Refactor the code and simplify the implementation. Signed-off-by: Srivatsa S. Bhat Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 10 +++++----- kernel/power/user.c | 6 ++---- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c6dee739080c..72baaf011fb7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -629,12 +629,8 @@ int hibernate(void) goto Finish; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); - if (error) - goto Thaw; - if (freezer_test_done) { - freezer_test_done = false; + if (error || freezer_test_done) goto Thaw; - } if (in_suspend) { unsigned int flags = 0; @@ -659,6 +655,10 @@ int hibernate(void) Thaw: thaw_processes(); + + /* Don't bother checking whether freezer_test_done is true */ + freezer_test_done = false; + Finish: free_basic_memory_bitmaps(); usermodehelper_enable(); diff --git a/kernel/power/user.c b/kernel/power/user.c index 7bee91f9af51..33c4329205af 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -251,10 +251,8 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = hibernation_snapshot(data->platform_support); if (!error) { error = put_user(in_suspend, (int __user *)arg); - if (!error && !freezer_test_done) - data->ready = 1; - if (freezer_test_done) - freezer_test_done = false; + data->ready = !freezer_test_done && !error; + freezer_test_done = false; } break; -- cgit v1.2.3 From f6302f1bcd75a042df69866d98b8d775a668f8f1 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 10 Feb 2012 09:03:58 +0100 Subject: relay: prevent integer overflow in relay_open() "subbuf_size" and "n_subbufs" come from the user and they need to be capped to prevent an integer overflow. Signed-off-by: Dan Carpenter Cc: stable@kernel.org Signed-off-by: Jens Axboe --- kernel/relay.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 4335e1d7ee2d..ab56a1764d4d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -164,10 +164,14 @@ depopulate: */ static struct rchan_buf *relay_create_buf(struct rchan *chan) { - struct rchan_buf *buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); - if (!buf) + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) return NULL; + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); if (!buf->padding) goto free_buf; @@ -574,6 +578,8 @@ struct rchan *relay_open(const char *base_filename, if (!(subbuf_size && n_subbufs)) return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) -- cgit v1.2.3 From 86f5e6a7b192721995ece919985ac75222402351 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vázquez Cao Date: Thu, 9 Feb 2012 17:42:22 -0500 Subject: watchdog: Fix code/comments mismatches Reflect the change in the soft and hard lockup thresholds and their relation to the frequency of the hrtimer and NMI events in the code comments. While at it, remove references to files that do not exist anymore. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Don Zickus Link: http://lkml.kernel.org/r/1328827342-6253-3-git-send-email-dzickus@redhat.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d117262deba3..14bc092fb12c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -3,12 +3,9 @@ * * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. * - * this code detects hard lockups: incidents in where on a CPU - * the kernel does not respond to anything except NMI. - * - * Note: Most of this code is borrowed heavily from softlockup.c, - * so thanks to Ingo for the initial implementation. - * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * Note: Most of this code is borrowed heavily from the original softlockup + * detector, so thanks to Ingo for the initial implementation. + * Some chunks also taken from the old x86-specific nmi watchdog code, thanks * to those contributors as well. */ @@ -117,9 +114,10 @@ static unsigned long get_sample_period(void) { /* * convert watchdog_thresh from seconds to ns - * the divide by 5 is to give hrtimer 5 chances to - * increment before the hardlockup detector generates - * a warning + * the divide by 5 is to give hrtimer several chances (two + * or three with the current relation between the soft + * and hard thresholds) to increment before the + * hardlockup detector generates a warning */ return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } @@ -336,9 +334,11 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); /* - * Run briefly once per second to reset the softlockup timestamp. - * If this gets delayed for more than 60 seconds then the - * debug-printout triggers in watchdog_timer_fn(). + * Run briefly (kicked by the hrtimer callback function) once every + * get_sample_period() seconds (4 seconds by default) to reset the + * softlockup timestamp. If this gets delayed for more than + * 2*watchdog_thresh seconds then the debug-printout triggers in + * watchdog_timer_fn(). */ while (!kthread_should_stop()) { __touch_watchdog(); -- cgit v1.2.3 From 1e42e83fde5537266c1d1e7fd8c010b3028d50fc Mon Sep 17 00:00:00 2001 From: Geunsik Lim Date: Wed, 8 Feb 2012 19:05:36 +0900 Subject: ftrace: sched_switch plugin is deprecated Actually, sched_switch function tracer is merged into wakeup/wakeup_rt Update 'mini-HOWTO' for ftrace(Kernel function tracer). If we want to trace "sched:sched_switch" to trace sched_switch func, We may utilize event option.(e.g: trace-cmd list -e | grep sched) This patch is based on Linux-3.3.rc2-SMP-PREEMPT Link: http://lkml.kernel.org/r/1328695537-15081-1-git-send-email-geunsik.lim@gmail.com Cc: Randy Dunlap Signed-off-by: Geunsik Lim Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a3f1bc5d2a00..10d5503f0d04 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2764,12 +2764,12 @@ static const char readme_msg[] = "tracing mini-HOWTO:\n\n" "# mount -t debugfs nodev /sys/kernel/debug\n\n" "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" + "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" "# cat /sys/kernel/debug/tracing/current_tracer\n" "nop\n" - "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" + "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" "# cat /sys/kernel/debug/tracing/current_tracer\n" - "sched_switch\n" + "wakeup\n" "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" -- cgit v1.2.3 From a9b542ee607a8afafa9447292394959fc84ea650 Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Mon, 13 Feb 2012 16:23:42 +0100 Subject: PM / QoS: unconditionally build the feature The PM QoS feature originally didn't depend on CONFIG_PM, which was mistakenly changed by commit e8db0be1245de16a6cc6365506abc392c3c212d4 PM QoS: Move and rename the implementation files Later, commit d020283dc694c9ec31b410f522252f7a8397e67d PM / QoS: CPU C-state breakage with PM Qos change partially fixed that by introducing a static inline definition of pm_qos_request(), but that still didn't allow user space to use the PM QoS interface if CONFIG_PM was unset (which had been possible before). For this reason, remove the dependency of PM QoS on CONFIG_PM to make it work (as intended) with CONFIG_PM unset. [rjw: Replaced the original changelog with a new one.] Signed-off-by: Jean Pihet Reported-by: Venkatesh Pallipadi Signed-off-by: Rafael J. Wysocki --- include/linux/pm_qos.h | 41 +---------------------------------------- kernel/power/Makefile | 3 ++- 2 files changed, 3 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 67c521731f41..c8a541e13380 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -67,7 +67,6 @@ static inline int dev_pm_qos_request_active(struct dev_pm_qos_request *req) return req->dev != 0; } -#ifdef CONFIG_PM int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, enum pm_qos_req_action action, int value); void pm_qos_add_request(struct pm_qos_request *req, int pm_qos_class, @@ -82,6 +81,7 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier); int pm_qos_request_active(struct pm_qos_request *req); s32 pm_qos_read_value(struct pm_qos_constraints *c); +#ifdef CONFIG_PM s32 __dev_pm_qos_read_value(struct device *dev); s32 dev_pm_qos_read_value(struct device *dev); int dev_pm_qos_add_request(struct device *dev, struct dev_pm_qos_request *req, @@ -99,45 +99,6 @@ void dev_pm_qos_constraints_destroy(struct device *dev); int dev_pm_qos_add_ancestor_request(struct device *dev, struct dev_pm_qos_request *req, s32 value); #else -static inline int pm_qos_update_target(struct pm_qos_constraints *c, - struct plist_node *node, - enum pm_qos_req_action action, - int value) - { return 0; } -static inline void pm_qos_add_request(struct pm_qos_request *req, - int pm_qos_class, s32 value) - { return; } -static inline void pm_qos_update_request(struct pm_qos_request *req, - s32 new_value) - { return; } -static inline void pm_qos_remove_request(struct pm_qos_request *req) - { return; } - -static inline int pm_qos_request(int pm_qos_class) -{ - switch (pm_qos_class) { - case PM_QOS_CPU_DMA_LATENCY: - return PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE; - case PM_QOS_NETWORK_LATENCY: - return PM_QOS_NETWORK_LAT_DEFAULT_VALUE; - case PM_QOS_NETWORK_THROUGHPUT: - return PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE; - default: - return PM_QOS_DEFAULT_VALUE; - } -} - -static inline int pm_qos_add_notifier(int pm_qos_class, - struct notifier_block *notifier) - { return 0; } -static inline int pm_qos_remove_notifier(int pm_qos_class, - struct notifier_block *notifier) - { return 0; } -static inline int pm_qos_request_active(struct pm_qos_request *req) - { return 0; } -static inline s32 pm_qos_read_value(struct pm_qos_constraints *c) - { return 0; } - static inline s32 __dev_pm_qos_read_value(struct device *dev) { return 0; } static inline s32 dev_pm_qos_read_value(struct device *dev) diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 07e0e28ffba7..66d808ec5252 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,7 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o qos.o +obj-y += qos.o +obj-$(CONFIG_PM) += main.o obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o -- cgit v1.2.3 From 6c83b4818dd65eb17e633b6b629a81da7bed90b3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 11 Feb 2012 00:00:34 +0100 Subject: PM / Sleep: Do not check wakeup too often in try_to_freeze_tasks() Use the observation that it is more efficient to check the wakeup variable once before the loop reporting tasks that were not frozen in try_to_freeze_tasks() than to do that in every step of that loop. Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 7e426459e60a..6aeb5efe00eb 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -98,13 +98,15 @@ static int try_to_freeze_tasks(bool user_only) elapsed_csecs / 100, elapsed_csecs % 100, todo - wq_busy, wq_busy); - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if (!wakeup && !freezer_should_skip(p) && - p != current && freezing(p) && !frozen(p)) - sched_show_task(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + if (!wakeup) { + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p != current && !freezer_should_skip(p) + && freezing(p) && !frozen(p)) + sched_show_task(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } } else { printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, elapsed_csecs % 100); -- cgit v1.2.3 From 6f585f750d792652f33b6e85b1ee205be4b5e572 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 11 Feb 2012 22:40:23 +0100 Subject: PM / Sleep: Remove unnecessary label from suspend_freeze_processes() The Finish label in suspend_freeze_processes() is in fact unnecessary and makes the function look more complicated than it really is, so remove that label (along with a few empty lines). Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/power.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/power.h b/kernel/power/power.h index 21724eee5206..398d42b48e9e 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -234,16 +234,14 @@ static inline int suspend_freeze_processes(void) int error; error = freeze_processes(); - /* * freeze_processes() automatically thaws every task if freezing * fails. So we need not do anything extra upon error. */ if (error) - goto Finish; + return error; error = freeze_kernel_threads(); - /* * freeze_kernel_threads() thaws only kernel threads upon freezing * failure. So we have to thaw the userspace tasks ourselves. @@ -251,7 +249,6 @@ static inline int suspend_freeze_processes(void) if (error) thaw_processes(); - Finish: return error; } -- cgit v1.2.3 From 95100358491abaa2e9a5483811370059bbca4645 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 24 Nov 2011 20:03:08 +0100 Subject: printk/tracing: Add console output tracing Add a printk.console trace point to record any printk messages into the trace, regardless of the current console loglevel. This can help correlate (existing) printk debugging with other tracing. Link: http://lkml.kernel.org/r/1322161388.5366.54.camel@jlt3.sipsolutions.net Acked-by: Frederic Weisbecker Cc: Christoph Hellwig Cc: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Thomas Gleixner Signed-off-by: Johannes Berg Signed-off-by: Steven Rostedt --- include/trace/events/printk.h | 41 +++++++++++++++++++++++++++++++++++++++++ kernel/printk.c | 5 +++++ 2 files changed, 46 insertions(+) create mode 100644 include/trace/events/printk.h (limited to 'kernel') diff --git a/include/trace/events/printk.h b/include/trace/events/printk.h new file mode 100644 index 000000000000..94ec79cc011a --- /dev/null +++ b/include/trace/events/printk.h @@ -0,0 +1,41 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM printk + +#if !defined(_TRACE_PRINTK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PRINTK_H + +#include + +TRACE_EVENT_CONDITION(console, + TP_PROTO(const char *log_buf, unsigned start, unsigned end, + unsigned log_buf_len), + + TP_ARGS(log_buf, start, end, log_buf_len), + + TP_CONDITION(start != end), + + TP_STRUCT__entry( + __dynamic_array(char, msg, end - start + 1) + ), + + TP_fast_assign( + if ((start & (log_buf_len - 1)) > (end & (log_buf_len - 1))) { + memcpy(__get_dynamic_array(msg), + log_buf + (start & (log_buf_len - 1)), + log_buf_len - (start & (log_buf_len - 1))); + memcpy((char *)__get_dynamic_array(msg) + + log_buf_len - (start & (log_buf_len - 1)), + log_buf, end & (log_buf_len - 1)); + } else + memcpy(__get_dynamic_array(msg), + log_buf + (start & (log_buf_len - 1)), + end - start); + ((char *)__get_dynamic_array(msg))[end - start] = 0; + ), + + TP_printk("%s", __get_str(msg)) +); +#endif /* _TRACE_PRINTK_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..cb8a6bd697c6 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -44,6 +44,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Architectures can override it: */ @@ -542,6 +545,8 @@ MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" static void _call_console_drivers(unsigned start, unsigned end, int msg_log_level) { + trace_console(&LOG_BUF(0), start, end, log_buf_len); + if ((msg_log_level < console_loglevel || ignore_loglevel) && console_drivers && start != end) { if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) { -- cgit v1.2.3 From 47b0edcb599ea6eb9ef16d3a08932a0e01485293 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Tue, 29 Nov 2011 22:08:00 +0100 Subject: tracing/trivial: Use kcalloc instead of kzalloc to allocate array The advantage of kcalloc is, that will prevent integer overflows which could result from the multiplication of number of elements and size and it is also a bit nicer to read. The semantic patch that makes this change is available in https://lkml.org/lkml/2011/11/25/107 Link: http://lkml.kernel.org/r/1322600880.1534.347.camel@localhost.localdomain Signed-off-by: Thomas Meyer Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_events_filter.c | 7 +++---- kernel/trace/trace_syscalls.c | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index e2e0597c0845..d1499e910fe8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1129,7 +1129,7 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits) return NULL; size = 1 << size_bits; - hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + hash->buckets = kcalloc(size, sizeof(*hash->buckets), GFP_KERNEL); if (!hash->buckets) { kfree(hash); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 24aee7127451..76afaee99dbc 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -685,7 +685,7 @@ find_event_field(struct ftrace_event_call *call, char *name) static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { - stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); if (!stack->preds) return -ENOMEM; stack->index = n_preds; @@ -826,8 +826,7 @@ static int __alloc_preds(struct event_filter *filter, int n_preds) if (filter->preds) __free_preds(filter); - filter->preds = - kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + filter->preds = kcalloc(n_preds, sizeof(*filter->preds), GFP_KERNEL); if (!filter->preds) return -ENOMEM; @@ -1486,7 +1485,7 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) children = count_leafs(preds, &preds[root->left]); children += count_leafs(preds, &preds[root->right]); - root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + root->ops = kcalloc(children, sizeof(*root->ops), GFP_KERNEL); if (!root->ops) return -ENOMEM; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cb654542c1a1..43500153dd1e 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -468,8 +468,8 @@ int __init init_ftrace_syscalls(void) unsigned long addr; int i; - syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * - NR_syscalls, GFP_KERNEL); + syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), + GFP_KERNEL); if (!syscalls_metadata) { WARN_ON(1); return -ENOMEM; -- cgit v1.2.3 From 191c542442fdf53cc3c496c00be13367fd9cd42d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: mm: collapse security_vm_enough_memory() variants into a single function Collapse security_vm_enough_memory() variants into a single function. Signed-off-by: Al Viro Signed-off-by: James Morris --- include/linux/security.h | 16 ---------------- kernel/fork.c | 2 +- mm/mmap.c | 4 ++-- mm/mprotect.c | 2 +- mm/mremap.c | 2 +- mm/shmem.c | 4 ++-- mm/swapfile.c | 4 +++- security/security.c | 14 -------------- 8 files changed, 10 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/security.h b/include/linux/security.h index 8325eddd9ee4..2fefad6d27a0 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1679,9 +1679,7 @@ int security_quotactl(int cmds, int type, int id, struct super_block *sb); int security_quota_on(struct dentry *dentry); int security_syslog(int type); int security_settime(const struct timespec *ts, const struct timezone *tz); -int security_vm_enough_memory(long pages); int security_vm_enough_memory_mm(struct mm_struct *mm, long pages); -int security_vm_enough_memory_kern(long pages); int security_bprm_set_creds(struct linux_binprm *bprm); int security_bprm_check(struct linux_binprm *bprm); void security_bprm_committing_creds(struct linux_binprm *bprm); @@ -1902,25 +1900,11 @@ static inline int security_settime(const struct timespec *ts, return cap_settime(ts, tz); } -static inline int security_vm_enough_memory(long pages) -{ - WARN_ON(current->mm == NULL); - return cap_vm_enough_memory(current->mm, pages); -} - static inline int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - WARN_ON(mm == NULL); return cap_vm_enough_memory(mm, pages); } -static inline int security_vm_enough_memory_kern(long pages) -{ - /* If current->mm is a kernel thread then we will pass NULL, - for this specific case that is fine */ - return cap_vm_enough_memory(current->mm, pages); -} - static inline int security_bprm_set_creds(struct linux_binprm *bprm) { return cap_bprm_set_creds(bprm); diff --git a/kernel/fork.c b/kernel/fork.c index f0e7781ba9b4..d5ebddf317a9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -355,7 +355,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) charge = 0; if (mpnt->vm_flags & VM_ACCOUNT) { unsigned int len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT; - if (security_vm_enough_memory(len)) + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } diff --git a/mm/mmap.c b/mm/mmap.c index 3f758c7f4c81..db05495d6d0a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1235,7 +1235,7 @@ munmap_back: */ if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; vm_flags |= VM_ACCOUNT; } @@ -2169,7 +2169,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (mm->map_count > sysctl_max_map_count) return -ENOMEM; - if (security_vm_enough_memory(len >> PAGE_SHIFT)) + if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ diff --git a/mm/mprotect.c b/mm/mprotect.c index 5a688a2756be..9599fa2d0e92 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -168,7 +168,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; newflags |= VM_ACCOUNT; } diff --git a/mm/mremap.c b/mm/mremap.c index 87bb8393e7d2..db8d983b5a7d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -329,7 +329,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, if (vma->vm_flags & VM_ACCOUNT) { unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) + if (security_vm_enough_memory_mm(mm, charged)) goto Efault; *p = charged; } diff --git a/mm/shmem.c b/mm/shmem.c index 269d049294ab..d9c293952755 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -127,7 +127,7 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) static inline int shmem_acct_size(unsigned long flags, loff_t size) { return (flags & VM_NORESERVE) ? - 0 : security_vm_enough_memory_kern(VM_ACCT(size)); + 0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size)); } static inline void shmem_unacct_size(unsigned long flags, loff_t size) @@ -145,7 +145,7 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) static inline int shmem_acct_block(unsigned long flags) { return (flags & VM_NORESERVE) ? - security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE)) : 0; + security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0; } static inline void shmem_unacct_blocks(unsigned long flags, long pages) diff --git a/mm/swapfile.c b/mm/swapfile.c index d999f090dfda..f0d79296dd55 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1563,6 +1563,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + BUG_ON(!current->mm); + pathname = getname(specialfile); err = PTR_ERR(pathname); if (IS_ERR(pathname)) @@ -1590,7 +1592,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (!security_vm_enough_memory(p->pages)) + if (!security_vm_enough_memory_mm(current->mm, p->pages)) vm_unacct_memory(p->pages); else { err = -ENOMEM; diff --git a/security/security.c b/security/security.c index 7d9426bb7442..44177add4713 100644 --- a/security/security.c +++ b/security/security.c @@ -187,25 +187,11 @@ int security_settime(const struct timespec *ts, const struct timezone *tz) return security_ops->settime(ts, tz); } -int security_vm_enough_memory(long pages) -{ - WARN_ON(current->mm == NULL); - return security_ops->vm_enough_memory(current->mm, pages); -} - int security_vm_enough_memory_mm(struct mm_struct *mm, long pages) { - WARN_ON(mm == NULL); return security_ops->vm_enough_memory(mm, pages); } -int security_vm_enough_memory_kern(long pages) -{ - /* If current->mm is a kernel thread then we will pass NULL, - for this specific case that is fine */ - return security_ops->vm_enough_memory(current->mm, pages); -} - int security_bprm_set_creds(struct linux_binprm *bprm) { return security_ops->bprm_set_creds(bprm); -- cgit v1.2.3 From 4040153087478993cbf0809f444400a3c808074c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 13 Feb 2012 03:58:52 +0000 Subject: security: trim security.h Trim security.h Signed-off-by: Al Viro Signed-off-by: James Morris --- drivers/net/macvtap.c | 1 + drivers/target/iscsi/iscsi_target.c | 1 + drivers/target/iscsi/iscsi_target_login.c | 1 + fs/nfs/client.c | 1 + fs/proc/proc_sysctl.c | 2 ++ fs/quota/dquot.c | 1 + fs/super.c | 1 + include/linux/security.h | 55 ++++++++++++++++--------------- include/net/sock.h | 2 ++ ipc/msgutil.c | 2 ++ kernel/cred.c | 1 + kernel/exit.c | 1 + kernel/sched/core.c | 1 + kernel/sysctl.c | 1 + mm/mmap.c | 13 ++++++++ security/commoncap.c | 1 + security/security.c | 2 ++ security/selinux/hooks.c | 2 ++ security/smack/smack_lsm.c | 3 ++ 19 files changed, 66 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 58dc117a8d78..0427c6561c84 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 44262908def5..33df66d91aad 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 38cb7ce8469e..1ee33a8c3fab 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 31778f74357d..d4f772ebd1ef 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a6b62173d4c3..67bbf6e4e197 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -6,7 +6,9 @@ #include #include #include +#include #include +#include #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 46741970371b..8b4f12b33f57 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/super.c b/fs/super.c index 6015c02296b7..18660532909e 100644 --- a/fs/super.c +++ b/fs/super.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" diff --git a/include/linux/security.h b/include/linux/security.h index 2fefad6d27a0..339b3b120f6c 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -22,22 +22,36 @@ #ifndef __LINUX_SECURITY_H #define __LINUX_SECURITY_H -#include -#include -#include -#include -#include -#include -#include -#include -#include /* PAGE_ALIGN */ -#include -#include #include -#include +#include #include -#include -#include +#include + +struct linux_binprm; +struct cred; +struct rlimit; +struct siginfo; +struct sem_array; +struct sembuf; +struct kern_ipc_perm; +struct audit_context; +struct super_block; +struct inode; +struct dentry; +struct file; +struct vfsmount; +struct path; +struct qstr; +struct nameidata; +struct iattr; +struct fown_struct; +struct file_operations; +struct shmid_kernel; +struct msg_msg; +struct msg_queue; +struct xattr; +struct xfrm_sec_ctx; +struct mm_struct; /* Maximum number of letters for an LSM name string */ #define SECURITY_NAME_MAX 10 @@ -49,6 +63,7 @@ struct ctl_table; struct audit_krule; struct user_namespace; +struct timezone; /* * These functions are in security/capability.c and are used @@ -131,18 +146,6 @@ struct request_sock; #define LSM_UNSAFE_PTRACE_CAP 4 #ifdef CONFIG_MMU -/* - * If a hint addr is less than mmap_min_addr change hint to be as - * low as possible but still greater than mmap_min_addr - */ -static inline unsigned long round_hint_to_min(unsigned long hint) -{ - hint &= PAGE_MASK; - if (((void *)hint != NULL) && - (hint < mmap_min_addr)) - return PAGE_ALIGN(mmap_min_addr); - return hint; -} extern int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf020..27508f07eada 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -56,6 +56,8 @@ #include #include #include +#include +#include #include #include diff --git a/ipc/msgutil.c b/ipc/msgutil.c index 5652101cdac0..26143d377c95 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -13,7 +13,9 @@ #include #include #include +#include #include +#include #include #include "util.h" diff --git a/kernel/cred.c b/kernel/cred.c index 5791612a4045..97b36eeca4c9 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #if 0 diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..5ad867a3685e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..78682bfb3405 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f487f257e05e..11d53046b905 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include diff --git a/mm/mmap.c b/mm/mmap.c index db05495d6d0a..694a8625ab0d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -935,6 +935,19 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, } #endif /* CONFIG_PROC_FS */ +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + /* * The caller must hold down_write(¤t->mm->mmap_sem). */ diff --git a/security/commoncap.c b/security/commoncap.c index 7ce191ea29a0..0cf4b53480a7 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -28,6 +28,7 @@ #include #include #include +#include /* * If a non-root user executes a setuid-root binary in diff --git a/security/security.c b/security/security.c index 44177add4713..bf619ffc9a4d 100644 --- a/security/security.c +++ b/security/security.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #define MAX_LSM_EVM_XATTR 2 diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6a3683e28426..304929909375 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -81,6 +81,8 @@ #include #include #include +#include +#include #include "avc.h" #include "objsec.h" diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index e8af5b0ba80f..cd667b4089a5 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -36,6 +36,9 @@ #include #include #include +#include +#include +#include #include "smack.h" #define task_security(task) (task_cred_xxx((task), security)) -- cgit v1.2.3 From 10f296cbfe3b93188c41463fd7a53808ebdbcbe3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 1 Feb 2012 10:33:11 +0800 Subject: module: make module param bint handle nul value Allow bint param accept nul values, just do same as bool param. Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Rusty Russell --- kernel/params.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 32ee04308285..4bc965d8a1fe 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -97,7 +97,8 @@ static int parse_one(char *param, for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool) + if (!val && params[i].ops->set != param_set_bool + && params[i].ops->set != param_set_bint) return -EINVAL; pr_debug("They are equal! Calling %p\n", params[i].ops->set); -- cgit v1.2.3 From 074b85175a43a23fdbde60f55feea636e0bf0f85 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Wed, 8 Feb 2012 12:39:07 -0800 Subject: vfs: fix panic in __d_lookup() with high dentry hashtable counts When the number of dentry cache hash table entries gets too high (2147483648 entries), as happens by default on a 16TB system, use of a signed integer in the dcache_init() initialization loop prevents the dentry_hashtable from getting initialized, causing a panic in __d_lookup(). Fix this in dcache_init() and similar areas. Signed-off-by: Dimitri Sivanich Acked-by: David S. Miller Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 8 ++++---- kernel/pid.c | 4 ++-- mm/page_alloc.c | 1 + net/ipv4/tcp.c | 5 +++-- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index 16a53cc2cc02..fe19ac13f75f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2968,7 +2968,7 @@ __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -2986,13 +2986,13 @@ static void __init dcache_init_early(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } static void __init dcache_init(void) { - int loop; + unsigned int loop; /* * A constructor could be added for stable state like the lists, @@ -3016,7 +3016,7 @@ static void __init dcache_init(void) &d_hash_mask, 0); - for (loop = 0; loop < (1 << d_hash_shift); loop++) + for (loop = 0; loop < (1U << d_hash_shift); loop++) INIT_HLIST_BL_HEAD(dentry_hashtable + loop); } diff --git a/fs/inode.c b/fs/inode.c index fb10d86ffad7..d3ebdbe723d0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1651,7 +1651,7 @@ __setup("ihash_entries=", set_ihash_entries); */ void __init inode_init_early(void) { - int loop; + unsigned int loop; /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. @@ -1669,13 +1669,13 @@ void __init inode_init_early(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } void __init inode_init(void) { - int loop; + unsigned int loop; /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", @@ -1699,7 +1699,7 @@ void __init inode_init(void) &i_hash_mask, 0); - for (loop = 0; loop < (1 << i_hash_shift); loop++) + for (loop = 0; loop < (1U << i_hash_shift); loop++) INIT_HLIST_HEAD(&inode_hashtable[loop]); } diff --git a/kernel/pid.c b/kernel/pid.c index ce8e00deaccb..9f08dfabaf13 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -543,12 +543,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) */ void __init pidhash_init(void) { - int i, pidhash_size; + unsigned int i, pidhash_size; pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, HASH_EARLY | HASH_SMALL, &pidhash_shift, NULL, 4096); - pidhash_size = 1 << pidhash_shift; + pidhash_size = 1U << pidhash_shift; for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2186ecb36f7..a13ded1938f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5236,6 +5236,7 @@ void *__init alloc_large_system_hash(const char *tablename, max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4; do_div(max, bucketsize); } + max = min(max, 0x80000000ULL); if (numentries > max) numentries = max; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 37755ccc0e96..22ef5f9fd2ff 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3240,7 +3240,8 @@ void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3283,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); -- cgit v1.2.3 From e1964c50a83d1ce53731c88271d12ac92292a880 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:48 -0700 Subject: irq_domain: Be less verbose irq_domain printk's too much. Drop some output. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Thomas Gleixner Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 1f9e26526b69..cc2cd43ec740 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -170,13 +170,11 @@ void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { struct device_node *node; - pr_info("looking for phys_base=%llx, irq_start=%i\n", + pr_debug("looking for phys_base=%llx, irq_start=%i\n", (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) irq_domain_add_simple(node, irq_start); - else - pr_info("no node found\n"); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif /* CONFIG_OF_IRQ */ -- cgit v1.2.3 From 7bb69bade0d41715bdf1b24f5ef0b8f798769fe9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:48 -0700 Subject: irq_domain: Make irq_domain structure match powerpc's irq_host Part of the series to unify the irq remapping mechanisms in the kernel. A follow up patch will copy the powerpc implementation into kernel/irq/irqdomain.c, which will be a lot easier if the structures are identical. Where they differ, I've chose to use the powerpc names since there is a lot more code using those names. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/arm/common/gic.c | 14 ++++---- include/linux/irqdomain.h | 84 ++++++++++++++++++++++++++++++++++++----------- kernel/irq/irqdomain.c | 14 ++++---- 3 files changed, 78 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index c47d6199b784..dc19862be0a8 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -619,10 +619,10 @@ static void __init gic_pm_init(struct gic_chip_data *gic) #endif #ifdef CONFIG_OF -static int gic_irq_domain_dt_translate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) +static int gic_irq_domain_xlate(struct irq_domain *d, + struct device_node *controller, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) { if (d->of_node != controller) return -EINVAL; @@ -641,9 +641,9 @@ static int gic_irq_domain_dt_translate(struct irq_domain *d, } #endif -const struct irq_domain_ops gic_irq_domain_ops = { +struct irq_domain_ops gic_irq_domain_ops = { #ifdef CONFIG_OF - .dt_translate = gic_irq_domain_dt_translate, + .xlate = gic_irq_domain_xlate, #endif }; @@ -721,7 +721,7 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, irq_start); domain->irq_base = irq_start; } - domain->priv = gic; + domain->host_data = gic; domain->ops = &gic_irq_domain_ops; irq_domain_add(domain); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bd4272b61a14..35b9ff382e45 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -9,61 +9,105 @@ * representation into a hardware irq number that can be mapped back to a * Linux irq number without any extra platform support code. * - * irq_domain is expected to be embedded in an interrupt controller's private - * data structure. + * Interrupt controller "domain" data structure. This could be defined as a + * irq domain controller. That is, it handles the mapping between hardware + * and virtual interrupt numbers for a given interrupt domain. The domain + * structure is generally created by the PIC code for a given PIC instance + * (though a domain can cover more than one PIC if they have a flat number + * model). It's the domain callbacks that are responsible for setting the + * irq_chip on a given irq_desc after it's been mapped. */ + #ifndef _LINUX_IRQDOMAIN_H #define _LINUX_IRQDOMAIN_H -#include -#include +#include +#include -#ifdef CONFIG_IRQ_DOMAIN struct device_node; struct irq_domain; +struct of_device_id; + +/* This type is the placeholder for a hardware interrupt number. It has to + * be big enough to enclose whatever representation is used by a given + * platform. + */ +typedef unsigned long irq_hw_number_t; /** * struct irq_domain_ops - Methods for irq_domain objects + * @match: Match an interrupt controller device node to a host, returns + * 1 on a match + * @map: Create or update a mapping between a virtual irq number and a hw + * irq number. This is called only once for a given mapping. + * @unmap: Dispose of such a mapping * @to_irq: (optional) given a local hardware irq number, return the linux * irq number. If to_irq is not implemented, then the irq_domain * will use this translation: irq = (domain->irq_base + hwirq) - * @dt_translate: Given a device tree node and interrupt specifier, decode - * the hardware irq number and linux irq type value. + * @xlate: Given a device tree node and interrupt specifier, decode + * the hardware irq number and linux irq type value. + * + * Functions below are provided by the driver and called whenever a new mapping + * is created or an old mapping is disposed. The driver can then proceed to + * whatever internal data structures management is required. It also needs + * to setup the irq_desc when returning from map(). */ struct irq_domain_ops { + int (*match)(struct irq_domain *d, struct device_node *node); + int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw); + void (*unmap)(struct irq_domain *d, unsigned int virq); unsigned int (*to_irq)(struct irq_domain *d, unsigned long hwirq); - -#ifdef CONFIG_OF - int (*dt_translate)(struct irq_domain *d, struct device_node *node, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type); -#endif /* CONFIG_OF */ + int (*xlate)(struct irq_domain *d, struct device_node *node, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type); }; /** * struct irq_domain - Hardware interrupt number translation object - * @list: Element in global irq_domain list. + * @link: Element in global irq_domain list. + * @revmap_type: Method used for reverse mapping hwirq numbers to linux irq. This + * will be one of the IRQ_DOMAIN_MAP_* values. + * @revmap_data: Revmap method specific data. + * @ops: pointer to irq_domain methods + * @host_data: private data pointer for use by owner. Not touched by irq_domain + * core code. * @irq_base: Start of irq_desc range assigned to the irq_domain. The creator * of the irq_domain is responsible for allocating the array of * irq_desc structures. * @nr_irq: Number of irqs managed by the irq domain * @hwirq_base: Starting number for hwirqs managed by the irq domain - * @ops: pointer to irq_domain methods - * @priv: private data pointer for use by owner. Not touched by irq_domain - * core code. * @of_node: (optional) Pointer to device tree nodes associated with the * irq_domain. Used when decoding device tree interrupt specifiers. */ struct irq_domain { - struct list_head list; + struct list_head link; + + /* type of reverse mapping_technique */ + unsigned int revmap_type; +#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ + union { + struct { + unsigned int size; + unsigned int *revmap; + } linear; + struct radix_tree_root tree; + } revmap_data; + struct irq_domain_ops *ops; + void *host_data; + irq_hw_number_t inval_irq; + unsigned int irq_base; unsigned int nr_irq; unsigned int hwirq_base; - const struct irq_domain_ops *ops; - void *priv; + + /* Optional device node pointer */ struct device_node *of_node; }; +#ifdef CONFIG_IRQ_DOMAIN /** * irq_domain_to_irq() - Translate from a hardware irq to a linux irq number * diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cc2cd43ec740..509adb8762d7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -43,7 +43,7 @@ void irq_domain_add(struct irq_domain *domain) } mutex_lock(&irq_domain_mutex); - list_add(&domain->list, &irq_domain_list); + list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); } @@ -57,7 +57,7 @@ void irq_domain_del(struct irq_domain *domain) int hwirq, irq; mutex_lock(&irq_domain_mutex); - list_del(&domain->list); + list_del(&domain->link); mutex_unlock(&irq_domain_mutex); /* Clear the irq_domain assignments */ @@ -88,10 +88,10 @@ unsigned int irq_create_of_mapping(struct device_node *controller, /* Find a domain which can translate the irq spec */ mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, list) { - if (!domain->ops->dt_translate) + list_for_each_entry(domain, &irq_domain_list, link) { + if (!domain->ops->xlate) continue; - rc = domain->ops->dt_translate(domain, controller, + rc = domain->ops->xlate(domain, controller, intspec, intsize, &hwirq, &type); if (rc == 0) break; @@ -126,7 +126,7 @@ void irq_dispose_mapping(unsigned int irq) } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -int irq_domain_simple_dt_translate(struct irq_domain *d, +int irq_domain_simple_xlate(struct irq_domain *d, struct device_node *controller, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type) @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(irq_domain_generate_simple); struct irq_domain_ops irq_domain_simple_ops = { #ifdef CONFIG_OF_IRQ - .dt_translate = irq_domain_simple_dt_translate, + .xlate = irq_domain_simple_xlate, #endif /* CONFIG_OF_IRQ */ }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit v1.2.3 From ac5637611150281f398bb7a47e3fcb69a09e7803 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 7 Feb 2012 17:58:03 +0100 Subject: genirq: Unmask oneshot irqs when thread was not woken When the primary handler of an interrupt which is marked IRQ_ONESHOT returns IRQ_HANDLED or IRQ_NONE, then the interrupt thread is not woken and the unmask logic of the interrupt line is never invoked. This keeps the interrupt masked forever. This was not noticed as most IRQ_ONESHOT users wake the thread unconditionally (usually because they cannot access the underlying device from hard interrupt context). Though this behaviour was nowhere documented and not necessarily intentional. Some drivers can avoid the thread wakeup in certain cases and run into the situation where the interrupt line s kept masked. Handle it gracefully. Reported-and-tested-by: Lothar Wassmann Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..b742edc0bdd4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -330,6 +330,24 @@ out_unlock: } EXPORT_SYMBOL_GPL(handle_simple_irq); +/* + * Called unconditionally from handle_level_irq() and only for oneshot + * interrupts from handle_fasteoi_irq() + */ +static void cond_unmask_irq(struct irq_desc *desc) +{ + /* + * We need to unmask in the following cases: + * - Standard level irq (IRQF_ONESHOT is not set) + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). + */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) + unmask_irq(desc); +} + /** * handle_level_irq - Level type irq handler * @irq: the interrupt number @@ -362,8 +380,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) handle_irq_event(desc); - if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT)) - unmask_irq(desc); + cond_unmask_irq(desc); + out_unlock: raw_spin_unlock(&desc->lock); } @@ -417,6 +435,9 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) preflow_handler(desc); handle_irq_event(desc); + if (desc->istate & IRQS_ONESHOT) + cond_unmask_irq(desc); + out_eoi: desc->irq_data.chip->irq_eoi(&desc->irq_data); out_unlock: -- cgit v1.2.3 From b4bc724e82e80478cba5fe9825b62e71ddf78757 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Feb 2012 11:57:52 +0100 Subject: genirq: Handle pending irqs in irq_startup() An interrupt might be pending when irq_startup() is called, but the startup code does not invoke the resend logic. In some cases this prevents the device from issuing another interrupt which renders the device non functional. Call the resend function in irq_startup() to keep things going. Reported-and-tested-by: Russell King Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/autoprobe.c | 4 ++-- kernel/irq/chip.c | 17 ++++++++++------- kernel/irq/internals.h | 2 +- kernel/irq/manage.c | 2 +- 4 files changed, 14 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 342d8f44e401..0119b9d467ae 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -53,7 +53,7 @@ unsigned long probe_irq_on(void) if (desc->irq_data.chip->irq_set_type) desc->irq_data.chip->irq_set_type(&desc->irq_data, IRQ_TYPE_PROBE); - irq_startup(desc); + irq_startup(desc, false); } raw_spin_unlock_irq(&desc->lock); } @@ -70,7 +70,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc)) + if (irq_startup(desc, false)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b742edc0bdd4..fb7db75ee0c8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -157,19 +157,22 @@ static void irq_state_set_masked(struct irq_desc *desc) irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); } -int irq_startup(struct irq_desc *desc) +int irq_startup(struct irq_desc *desc, bool resend) { + int ret = 0; + irq_state_clr_disabled(desc); desc->depth = 0; if (desc->irq_data.chip->irq_startup) { - int ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); - return ret; + } else { + irq_enable(desc); } - - irq_enable(desc); - return 0; + if (resend) + check_irq_resend(desc, desc->irq_data.irq); + return ret; } void irq_shutdown(struct irq_desc *desc) @@ -646,7 +649,7 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, irq_settings_set_noprobe(desc); irq_settings_set_norequest(desc); irq_settings_set_nothread(desc); - irq_startup(desc); + irq_startup(desc, true); } out: irq_put_desc_busunlock(desc, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b7952316016a..40378ff877e7 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -67,7 +67,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); -extern int irq_startup(struct irq_desc *desc); +extern int irq_startup(struct irq_desc *desc, bool resend); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a9a9dbe49fea..32313c084442 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1027,7 +1027,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) desc->istate |= IRQS_ONESHOT; if (irq_settings_can_autoenable(desc)) - irq_startup(desc); + irq_startup(desc, true); else /* Undo nested disables: */ desc->depth = 1; -- cgit v1.2.3 From 59263b513c11398cd66a52d4c5b2b118ce1e0359 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Feb 2012 12:08:34 +0100 Subject: futex: Cover all PI opcodes with cmpxchg enabled check Some of the newer futex PI opcodes do not check the cmpxchg enabled variable and call unconditionally into the handling functions. Cover all PI opcodes in a separate check. Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Cc: Peter Zijlstra Cc: Darren Hart --- kernel/futex.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ea87f4d2f455..4b1c4b6a339c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2628,6 +2628,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, return -ENOSYS; } + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_UNLOCK_PI: + case FUTEX_TRYLOCK_PI: + case FUTEX_WAIT_REQUEUE_PI: + case FUTEX_CMP_REQUEUE_PI: + if (!futex_cmpxchg_enabled) + return -ENOSYS; + } + switch (cmd) { case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; @@ -2649,16 +2659,13 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, flags, val, timeout, 0); + ret = futex_lock_pi(uaddr, flags, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_unlock_pi(uaddr, flags); + ret = futex_unlock_pi(uaddr, flags); break; case FUTEX_TRYLOCK_PI: - if (futex_cmpxchg_enabled) - ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); + ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); break; case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; -- cgit v1.2.3 From 81b40539e748b108d143a5e38526ab00a6a784b6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Feb 2012 12:17:09 +0100 Subject: futex: Simplify return logic No need to assign ret in each case and break. Simply return the result of the handler function directly. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Darren Hart --- kernel/futex.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4b1c4b6a339c..2364c99dd982 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2616,7 +2616,7 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret = -ENOSYS, cmd = op & FUTEX_CMD_MASK; + int cmd = op & FUTEX_CMD_MASK; unsigned int flags = 0; if (!(op & FUTEX_PRIVATE_FLAG)) @@ -2642,43 +2642,31 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, case FUTEX_WAIT: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAIT_BITSET: - ret = futex_wait(uaddr, flags, val, timeout, val3); - break; + return futex_wait(uaddr, flags, val, timeout, val3); case FUTEX_WAKE: val3 = FUTEX_BITSET_MATCH_ANY; case FUTEX_WAKE_BITSET: - ret = futex_wake(uaddr, flags, val, val3); - break; + return futex_wake(uaddr, flags, val, val3); case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); - break; + return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); - break; + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); case FUTEX_WAKE_OP: - ret = futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); - break; + return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, flags, val, timeout, 0); - break; + return futex_lock_pi(uaddr, flags, val, timeout, 0); case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr, flags); - break; + return futex_unlock_pi(uaddr, flags); case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, flags, 0, timeout, 1); - break; + return futex_lock_pi(uaddr, flags, 0, timeout, 1); case FUTEX_WAIT_REQUEUE_PI: val3 = FUTEX_BITSET_MATCH_ANY; - ret = futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, - uaddr2); - break; + return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); case FUTEX_CMP_REQUEUE_PI: - ret = futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); - break; - default: - ret = -ENOSYS; + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); } - return ret; + return -ENOSYS; } -- cgit v1.2.3 From 77b0d60c5adf39c74039e2142a1d3cd1e4d53799 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 4 Nov 2011 17:18:21 -0700 Subject: clockevents: Leave the broadcast device in shutdown mode when not needed Platforms with Always Running APIC Timer doesn't use the broadcast timer but the kernel is leaving the broadcast timer (HPET in this case) in oneshot mode. On these platforms, before the switch to oneshot mode, broadcast device is actually in shutdown mode. Code checks for empty tick_broadcast_mask and avoids going into the periodic mode. During switch to oneshot mode, add the same tick_broadcast_mask checks in the tick_broadcast_switch_to_oneshot() and avoid the broadcast device going into the oneshot mode. Signed-off-by: Suresh Siddha Cc: john stultz Cc: venki@google.com Link: http://lkml.kernel.org/r/1320452301.15071.16.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index fd4a7b1625a2..e883f57a3cd3 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -575,11 +575,15 @@ void tick_broadcast_switch_to_oneshot(void) unsigned long flags; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + if (cpumask_empty(tick_get_broadcast_mask())) + goto end; tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; bc = tick_broadcast_device.evtdev; if (bc) tick_broadcast_setup_oneshot(bc); + +end: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From 430ee8819553f66fe00e36f676a45886d76e7e8b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 1 Dec 2011 17:00:22 +0100 Subject: nohz: Remove update_ts_time_stat from tick_nohz_start_idle There is no reason to call update_ts_time_stat from tick_nohz_start_idle anymore (after e0e37c20 sched: Eliminate the ts->idle_lastupdate field) when we updated idle_lastupdate unconditionally. We haven't set idle_active yet and do not provide last_update_time so the whole call end up being just 2 wasted branches. Signed-off-by: Michal Hocko Cc: Arjan van de Ven Link: http://lkml.kernel.org/r/1322755222-6951-1-git-send-email-mhocko@suse.cz Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7656642e4b8e..8cfffd9d9ce9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -182,11 +182,7 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { - ktime_t now; - - now = ktime_get(); - - update_ts_time_stats(cpu, ts, now, NULL); + ktime_t now = ktime_get(); ts->idle_entrytime = now; ts->idle_active = 1; -- cgit v1.2.3 From 15f827be93928890bba965bc175caee50c4406d2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 24 Jan 2012 18:59:43 +0100 Subject: nohz: Remove ts->Einidle checks before restarting the tick ts->inidle is set by tick_nohz_idle_enter() and unset by tick_nohz_idle_exit(). However these two calls are assumed to be always paired. This means that by the time we call tick_nohz_idle_exit(), ts->inidle is supposed to be always set to 1. Remove the checks for ts->inidle in tick_nohz_idle_exit(). This simplifies a bit the code and improves its debuggability (ie: ensure the call is paired with a tick_nohz_idle_enter() call). Signed-off-by: Frederic Weisbecker Reviewed-by: Yong Zhang Cc: Peter Zijlstra Cc: John Stultz Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1327427984-23282-2-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8cfffd9d9ce9..3526038f2836 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -558,20 +558,21 @@ void tick_nohz_idle_exit(void) local_irq_disable(); - if (ts->idle_active || (ts->inidle && ts->tick_stopped)) + WARN_ON_ONCE(!ts->inidle); + + ts->inidle = 0; + + if (ts->idle_active || ts->tick_stopped) now = ktime_get(); if (ts->idle_active) tick_nohz_stop_idle(cpu, now); - if (!ts->inidle || !ts->tick_stopped) { - ts->inidle = 0; + if (!ts->tick_stopped) { local_irq_enable(); return; } - ts->inidle = 0; - /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); -- cgit v1.2.3 From 0a8a2e78b7eece7c65884fcff9f98dc0fce89ee4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 24 Jan 2012 18:59:44 +0100 Subject: timer: Fix bad idle check on irq entry idle_cpu() is called on irq entry to guess if we need to call tick_check_idle(). This way we can catch up with jiffies if the tick was stopped, stop accounting idle time during the interrupt and maintain the sched clock if it is unstable. But if we are going to exit the idle loop to schedule a new task (ie: if we have a task in the runqueue or a remotely enqueued ttwu to perform), the idle_cpu() check will return 0 such that we miss the call to tick_check_idle() for all interrupts happening before we schedule the new task. As a result these interrupts and the softirqs coming along may deal with stale jiffies values, bad sched clock values, and won't substract their time from the idle time accounting. Fix this with using is_idle_task() instead that strictly checks that we are running the idle task, without caring about the fact we are going to schedule a task soon. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: John Stultz Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1327427984-23282-3-git-send-email-fweisbec@gmail.com Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..5ace266bc0e6 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -297,7 +297,7 @@ void irq_enter(void) int cpu = smp_processor_id(); rcu_irq_enter(); - if (idle_cpu(cpu) && !in_interrupt()) { + if (is_idle_task(current) && !in_interrupt()) { /* * Prevent raise_softirq from needlessly waking up ksoftirqd * here, as softirq will be serviced on return from interrupt. -- cgit v1.2.3 From cc79ca691c292e9fd44f589c7940b9654e22f2f6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 16 Feb 2012 01:37:49 -0700 Subject: irq_domain: Move irq_domain code from powerpc to kernel/irq This patch only moves the code. It doesn't make any changes, and the code is still only compiled for powerpc. Follow-on patches will generalize the code for other architectures. Signed-off-by: Grant Likely Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/irq.h | 144 ---------- arch/powerpc/kernel/irq.c | 502 ---------------------------------- include/linux/irqdomain.h | 46 +++- kernel/irq/irqdomain.c | 600 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 643 insertions(+), 650 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 1919634a9b32..303703d716fe 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,6 +135,7 @@ config PPC select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ select IRQ_PER_CPU + select IRQ_DOMAIN select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select IRQ_FORCED_THREADING diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index f80f262e0597..728cc30d04ea 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -42,154 +42,10 @@ extern atomic_t ppc_n_lost_interrupts; /* Same thing, used by the generic IRQ code */ #define NR_IRQS_LEGACY NUM_ISA_INTERRUPTS -/* - * The host code and data structures are fairly agnostic to the fact that - * we use an open firmware device-tree. We do have references to struct - * device_node in two places: in irq_find_host() to find the host matching - * a given interrupt controller node, and of course as an argument to its - * counterpart host->ops->match() callback. However, those are treated as - * generic pointers by the core and the fact that it's actually a device-node - * pointer is purely a convention between callers and implementation. This - * code could thus be used on other architectures by replacing those two - * by some sort of arch-specific void * "token" used to identify interrupt - * controllers. - */ - struct irq_data; extern irq_hw_number_t irqd_to_hwirq(struct irq_data *d); extern irq_hw_number_t virq_to_hw(unsigned int virq); -/** - * irq_alloc_host - Allocate a new irq_domain data structure - * @of_node: optional device-tree node of the interrupt controller - * @revmap_type: type of reverse mapping to use - * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map - * @ops: map/unmap host callbacks - * @inval_irq: provide a hw number in that host space that is always invalid - * - * Allocates and initialize and irq_domain structure. Note that in the case of - * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by - * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be allocated - * later during boot automatically (the reverse mapping will use the slow path - * until that happens). - */ -extern struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq); - - -/** - * irq_find_host - Locates a host for a given device node - * @node: device-tree node of the interrupt controller - */ -extern struct irq_domain *irq_find_host(struct device_node *node); - - -/** - * irq_set_default_host - Set a "default" host - * @host: default host pointer - * - * For convenience, it's possible to set a "default" host that will be used - * whenever NULL is passed to irq_create_mapping(). It makes life easier for - * platforms that want to manipulate a few hard coded interrupt numbers that - * aren't properly represented in the device-tree. - */ -extern void irq_set_default_host(struct irq_domain *host); - - -/** - * irq_set_virq_count - Set the maximum number of virt irqs - * @count: number of linux virtual irqs, capped with NR_IRQS - * - * This is mainly for use by platforms like iSeries who want to program - * the virtual irq number in the controller to avoid the reverse mapping - */ -extern void irq_set_virq_count(unsigned int count); - - -/** - * irq_create_mapping - Map a hardware interrupt into linux virq space - * @host: host owning this hardware interrupt or NULL for default host - * @hwirq: hardware irq number in that host space - * - * Only one mapping per hardware interrupt is permitted. Returns a linux - * virq number. - * If the sense/trigger is to be specified, set_irq_type() should be called - * on the number returned from that call. - */ -extern unsigned int irq_create_mapping(struct irq_domain *host, - irq_hw_number_t hwirq); - - -/** - * irq_dispose_mapping - Unmap an interrupt - * @virq: linux virq number of the interrupt to unmap - */ -extern void irq_dispose_mapping(unsigned int virq); - -/** - * irq_find_mapping - Find a linux virq from an hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space - * - * This is a slow path, for use by generic code. It's expected that an - * irq controller implementation directly calls the appropriate low level - * mapping function. - */ -extern unsigned int irq_find_mapping(struct irq_domain *host, - irq_hw_number_t hwirq); - -/** - * irq_create_direct_mapping - Allocate a virq for direct mapping - * @host: host to allocate the virq for or NULL for default host - * - * This routine is used for irq controllers which can choose the hardware - * interrupt numbers they generate. In such a case it's simplest to use - * the linux virq as the hardware interrupt number. - */ -extern unsigned int irq_create_direct_mapping(struct irq_domain *host); - -/** - * irq_radix_revmap_insert - Insert a hw irq to linux virq number mapping. - * @host: host owning this hardware interrupt - * @virq: linux irq number - * @hwirq: hardware irq number in that host space - * - * This is for use by irq controllers that use a radix tree reverse - * mapping for fast lookup. - */ -extern void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, - irq_hw_number_t hwirq); - -/** - * irq_radix_revmap_lookup - Find a linux virq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space - * - * This is a fast path, for use by irq controller code that uses radix tree - * revmaps - */ -extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, - irq_hw_number_t hwirq); - -/** - * irq_linear_revmap - Find a linux virq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space - * - * This is a fast path, for use by irq controller code that uses linear - * revmaps. It does fallback to the slow path if the revmap doesn't exist - * yet and will create the revmap entry with appropriate locking - */ - -extern unsigned int irq_linear_revmap(struct irq_domain *host, - irq_hw_number_t hwirq); - - /** * irq_early_init - Init irq remapping subsystem */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 269fbd5ac62f..e3673ff6b7a0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -486,17 +486,6 @@ void do_softirq(void) local_irq_restore(flags); } - -/* - * IRQ controller and virtual interrupts - */ - -static LIST_HEAD(irq_domain_list); -static DEFINE_MUTEX(irq_domain_mutex); -static DEFINE_MUTEX(revmap_trees_mutex); -static unsigned int irq_virq_count = NR_IRQS; -static struct irq_domain *irq_default_host; - irq_hw_number_t irqd_to_hwirq(struct irq_data *d) { return d->hwirq; @@ -510,362 +499,6 @@ irq_hw_number_t virq_to_hw(unsigned int virq) } EXPORT_SYMBOL_GPL(virq_to_hw); -static int default_irq_host_match(struct irq_domain *h, struct device_node *np) -{ - return h->of_node != NULL && h->of_node == np; -} - -struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq) -{ - struct irq_domain *host, *h; - unsigned int size = sizeof(struct irq_domain); - unsigned int i; - unsigned int *rmap; - - /* Allocate structure and revmap table if using linear mapping */ - if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) - size += revmap_arg * sizeof(unsigned int); - host = kzalloc(size, GFP_KERNEL); - if (host == NULL) - return NULL; - - /* Fill structure */ - host->revmap_type = revmap_type; - host->inval_irq = inval_irq; - host->ops = ops; - host->of_node = of_node_get(of_node); - - if (host->ops->match == NULL) - host->ops->match = default_irq_host_match; - - mutex_lock(&irq_domain_mutex); - /* Make sure only one legacy controller can be created */ - if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { - mutex_unlock(&irq_domain_mutex); - of_node_put(host->of_node); - kfree(host); - return NULL; - } - } - } - list_add(&host->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); - - /* Additional setups per revmap type */ - switch(revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* 0 is always the invalid number for legacy */ - host->inval_irq = 0; - /* setup us as the host for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; - irq_data->domain = host; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - ops->map(host, i, i); - - /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); - } - break; - case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(host + 1); - for (i = 0; i < revmap_arg; i++) - rmap[i] = NO_IRQ; - host->revmap_data.linear.size = revmap_arg; - host->revmap_data.linear.revmap = rmap; - break; - case IRQ_DOMAIN_MAP_TREE: - INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); - break; - default: - break; - } - - pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); - - return host; -} - -struct irq_domain *irq_find_host(struct device_node *node) -{ - struct irq_domain *h, *found = NULL; - - /* We might want to match the legacy controller last since - * it might potentially be set to match all interrupts in - * the absence of a device node. This isn't a problem so far - * yet though... - */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(h, &irq_domain_list, link) - if (h->ops->match(h, node)) { - found = h; - break; - } - mutex_unlock(&irq_domain_mutex); - return found; -} -EXPORT_SYMBOL_GPL(irq_find_host); - -void irq_set_default_host(struct irq_domain *host) -{ - pr_debug("irq: Default host set to @0x%p\n", host); - - irq_default_host = host; -} - -void irq_set_virq_count(unsigned int count) -{ - pr_debug("irq: Trying to set virq count to %d\n", count); - - BUG_ON(count < NUM_ISA_INTERRUPTS); - if (count < NR_IRQS) - irq_virq_count = count; -} - -static int irq_setup_virq(struct irq_domain *host, unsigned int virq, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - - irq_data->hwirq = hwirq; - irq_data->domain = host; - if (host->ops->map(host, virq, hwirq)) { - pr_debug("irq: -> mapping failed, freeing\n"); - irq_data->domain = NULL; - irq_data->hwirq = 0; - return -1; - } - - irq_clear_status_flags(virq, IRQ_NOREQUEST); - - return 0; -} - -unsigned int irq_create_direct_mapping(struct irq_domain *host) -{ - unsigned int virq; - - if (host == NULL) - host = irq_default_host; - - BUG_ON(host == NULL); - WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); - - virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { - pr_debug("irq: create_direct virq allocation failed\n"); - return NO_IRQ; - } - if (virq >= irq_virq_count) { - pr_err("ERROR: no free irqs available below %i maximum\n", - irq_virq_count); - irq_free_desc(virq); - return 0; - } - - pr_debug("irq: create_direct obtained virq %d\n", virq); - - if (irq_setup_virq(host, virq, virq)) { - irq_free_desc(virq); - return NO_IRQ; - } - - return virq; -} - -unsigned int irq_create_mapping(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - unsigned int virq, hint; - - pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); - - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) { - printk(KERN_WARNING "irq_create_mapping called for" - " NULL host, hwirq=%lx\n", hwirq); - WARN_ON(1); - return NO_IRQ; - } - pr_debug("irq: -> using host @%p\n", host); - - /* Check if mapping already exists */ - virq = irq_find_mapping(host, hwirq); - if (virq != NO_IRQ) { - pr_debug("irq: -> existing mapping on virq %d\n", virq); - return virq; - } - - /* Get a virtual interrupt number */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - /* Handle legacy */ - virq = (unsigned int)hwirq; - if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return NO_IRQ; - return virq; - } else { - /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; - if (hint == 0) - hint = 1; - virq = irq_alloc_desc_from(hint, 0); - if (!virq) - virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { - pr_debug("irq: -> virq allocation failed\n"); - return NO_IRQ; - } - } - - if (irq_setup_virq(host, virq, hwirq)) { - if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) - irq_free_desc(virq); - return NO_IRQ; - } - - pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", - hwirq, host->of_node ? host->of_node->full_name : "null", virq); - - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_mapping); - -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *host; - irq_hw_number_t hwirq; - unsigned int type = IRQ_TYPE_NONE; - unsigned int virq; - - if (controller == NULL) - host = irq_default_host; - else - host = irq_find_host(controller); - if (host == NULL) { - printk(KERN_WARNING "irq: no irq host found for %s !\n", - controller->full_name); - return NO_IRQ; - } - - /* If host has no translation, then we assume interrupt line */ - if (host->ops->xlate == NULL) - hwirq = intspec[0]; - else { - if (host->ops->xlate(host, controller, intspec, intsize, - &hwirq, &type)) - return NO_IRQ; - } - - /* Create mapping */ - virq = irq_create_mapping(host, hwirq); - if (virq == NO_IRQ) - return virq; - - /* Set type if specified and different than the current one */ - if (type != IRQ_TYPE_NONE && - type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) - irq_set_irq_type(virq, type); - return virq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - -void irq_dispose_mapping(unsigned int virq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - struct irq_domain *host; - irq_hw_number_t hwirq; - - if (virq == NO_IRQ || !irq_data) - return; - - host = irq_data->domain; - if (WARN_ON(host == NULL)) - return; - - /* Never unmap legacy interrupts */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return; - - irq_set_status_flags(virq, IRQ_NOREQUEST); - - /* remove chip and handler */ - irq_set_chip_and_handler(virq, NULL, NULL); - - /* Make sure it's completed */ - synchronize_irq(virq); - - /* Tell the PIC about it */ - if (host->ops->unmap) - host->ops->unmap(host, virq); - smp_mb(); - - /* Clear reverse map */ - hwirq = irq_data->hwirq; - switch(host->revmap_type) { - case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < host->revmap_data.linear.size) - host->revmap_data.linear.revmap[hwirq] = NO_IRQ; - break; - case IRQ_DOMAIN_MAP_TREE: - mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&host->revmap_data.tree, hwirq); - mutex_unlock(&revmap_trees_mutex); - break; - } - - /* Destroy map */ - irq_data->hwirq = host->inval_irq; - - irq_free_desc(virq); -} -EXPORT_SYMBOL_GPL(irq_dispose_mapping); - -unsigned int irq_find_mapping(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - unsigned int i; - unsigned int hint = hwirq % irq_virq_count; - - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) - return NO_IRQ; - - /* legacy -> bail early */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return hwirq; - - /* Slow path does a linear search of the map */ - if (hint == 0) - hint = 1; - i = hint; - do { - struct irq_data *data = irq_get_irq_data(i); - if (data && (data->domain == host) && (data->hwirq == hwirq)) - return i; - i++; - if (i >= irq_virq_count) - i = 1; - } while(i != hint); - return NO_IRQ; -} -EXPORT_SYMBOL_GPL(irq_find_mapping); - #ifdef CONFIG_SMP int irq_choose_cpu(const struct cpumask *mask) { @@ -902,146 +535,11 @@ int irq_choose_cpu(const struct cpumask *mask) } #endif -unsigned int irq_radix_revmap_lookup(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data; - - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return irq_find_mapping(host, hwirq); - - /* - * Freeing an irq can delete nodes along the path to - * do the lookup via call_rcu. - */ - rcu_read_lock(); - irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); - rcu_read_unlock(); - - /* - * If found in radix tree, then fine. - * Else fallback to linear lookup - this should not happen in practice - * as it means that we failed to insert the node in the radix tree. - */ - return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); -} - -void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, - irq_hw_number_t hwirq) -{ - struct irq_data *irq_data = irq_get_irq_data(virq); - - if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return; - - if (virq != NO_IRQ) { - mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); - mutex_unlock(&revmap_trees_mutex); - } -} - -unsigned int irq_linear_revmap(struct irq_domain *host, - irq_hw_number_t hwirq) -{ - unsigned int *revmap; - - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) - return irq_find_mapping(host, hwirq); - - /* Check revmap bounds */ - if (unlikely(hwirq >= host->revmap_data.linear.size)) - return irq_find_mapping(host, hwirq); - - /* Check if revmap was allocated */ - revmap = host->revmap_data.linear.revmap; - if (unlikely(revmap == NULL)) - return irq_find_mapping(host, hwirq); - - /* Fill up revmap with slow path if no mapping found */ - if (unlikely(revmap[hwirq] == NO_IRQ)) - revmap[hwirq] = irq_find_mapping(host, hwirq); - - return revmap[hwirq]; -} - int arch_early_irq_init(void) { return 0; } -#ifdef CONFIG_VIRQ_DEBUG -static int virq_debug_show(struct seq_file *m, void *private) -{ - unsigned long flags; - struct irq_desc *desc; - const char *p; - static const char none[] = "none"; - void *data; - int i; - - seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "host name"); - - for (i = 1; i < nr_irqs; i++) { - desc = irq_to_desc(i); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - - if (desc->action && desc->action->handler) { - struct irq_chip *chip; - - seq_printf(m, "%5d ", i); - seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); - - chip = irq_desc_get_chip(desc); - if (chip && chip->name) - p = chip->name; - else - p = none; - seq_printf(m, "%-15s ", p); - - data = irq_desc_get_chip_data(desc); - seq_printf(m, "0x%16p ", data); - - if (desc->irq_data.domain->of_node) - p = desc->irq_data.domain->of_node->full_name; - else - p = none; - seq_printf(m, "%s\n", p); - } - - raw_spin_unlock_irqrestore(&desc->lock, flags); - } - - return 0; -} - -static int virq_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, virq_debug_show, inode->i_private); -} - -static const struct file_operations virq_debug_fops = { - .open = virq_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init irq_debugfs_init(void) -{ - if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, - NULL, &virq_debug_fops) == NULL) - return -ENOMEM; - - return 0; -} -__initcall(irq_debugfs_init); -#endif /* CONFIG_VIRQ_DEBUG */ - #ifdef CONFIG_PPC64 static int __init setup_noirqdistrib(char *str) { diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 35b9ff382e45..18f4ab002d2e 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -16,6 +16,17 @@ * (though a domain can cover more than one PIC if they have a flat number * model). It's the domain callbacks that are responsible for setting the * irq_chip on a given irq_desc after it's been mapped. + * + * The host code and data structures are agnostic to whether or not + * we use an open firmware device-tree. We do have references to struct + * device_node in two places: in irq_find_host() to find the host matching + * a given interrupt controller node, and of course as an argument to its + * counterpart domain->ops->match() callback. However, those are treated as + * generic pointers by the core and the fact that it's actually a device-node + * pointer is purely a convention between callers and implementation. This + * code could thus be used on other architectures by replacing those two + * by some sort of arch-specific void * "token" used to identify interrupt + * controllers. */ #ifndef _LINUX_IRQDOMAIN_H @@ -108,6 +119,32 @@ struct irq_domain { }; #ifdef CONFIG_IRQ_DOMAIN +#ifdef CONFIG_PPC +extern struct irq_domain *irq_alloc_host(struct device_node *of_node, + unsigned int revmap_type, + unsigned int revmap_arg, + struct irq_domain_ops *ops, + irq_hw_number_t inval_irq); +extern struct irq_domain *irq_find_host(struct device_node *node); +extern void irq_set_default_host(struct irq_domain *host); +extern void irq_set_virq_count(unsigned int count); + + +extern unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq); +extern void irq_dispose_mapping(unsigned int virq); +extern unsigned int irq_find_mapping(struct irq_domain *host, + irq_hw_number_t hwirq); +extern unsigned int irq_create_direct_mapping(struct irq_domain *host); +extern void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq); +extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, + irq_hw_number_t hwirq); +extern unsigned int irq_linear_revmap(struct irq_domain *host, + irq_hw_number_t hwirq); + +#else /* CONFIG_PPC */ + /** * irq_domain_to_irq() - Translate from a hardware irq to a linux irq number * @@ -137,15 +174,16 @@ extern void irq_domain_add(struct irq_domain *domain); extern void irq_domain_del(struct irq_domain *domain); extern struct irq_domain_ops irq_domain_simple_ops; -#endif /* CONFIG_IRQ_DOMAIN */ -#if defined(CONFIG_IRQ_DOMAIN) && defined(CONFIG_OF_IRQ) +#if defined(CONFIG_OF_IRQ) extern void irq_domain_add_simple(struct device_node *controller, int irq_base); extern void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start); -#else /* CONFIG_IRQ_DOMAIN && CONFIG_OF_IRQ */ +#else /* CONFIG_OF_IRQ */ static inline void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { } -#endif /* CONFIG_IRQ_DOMAIN && CONFIG_OF_IRQ */ +#endif /* !CONFIG_OF_IRQ */ +#endif /* !CONFIG_PPC */ +#endif /* CONFIG_IRQ_DOMAIN */ #endif /* _LINUX_IRQDOMAIN_H */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 509adb8762d7..f551bc1d3167 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1,14 +1,612 @@ +#include +#include +#include #include +#include #include #include #include #include #include +#include #include +#include +#include static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); +#ifdef CONFIG_PPC +static DEFINE_MUTEX(revmap_trees_mutex); +static unsigned int irq_virq_count = NR_IRQS; +static struct irq_domain *irq_default_host; + +static int default_irq_host_match(struct irq_domain *h, struct device_node *np) +{ + return h->of_node != NULL && h->of_node == np; +} + +/** + * irq_alloc_host() - Allocate a new irq_domain data structure + * @of_node: optional device-tree node of the interrupt controller + * @revmap_type: type of reverse mapping to use + * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map + * @ops: map/unmap host callbacks + * @inval_irq: provide a hw number in that host space that is always invalid + * + * Allocates and initialize and irq_domain structure. Note that in the case of + * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by + * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be + * allocated later during boot automatically (the reverse mapping will use the + * slow path until that happens). + */ +struct irq_domain *irq_alloc_host(struct device_node *of_node, + unsigned int revmap_type, + unsigned int revmap_arg, + struct irq_domain_ops *ops, + irq_hw_number_t inval_irq) +{ + struct irq_domain *host, *h; + unsigned int size = sizeof(struct irq_domain); + unsigned int i; + unsigned int *rmap; + + /* Allocate structure and revmap table if using linear mapping */ + if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) + size += revmap_arg * sizeof(unsigned int); + host = kzalloc(size, GFP_KERNEL); + if (host == NULL) + return NULL; + + /* Fill structure */ + host->revmap_type = revmap_type; + host->inval_irq = inval_irq; + host->ops = ops; + host->of_node = of_node_get(of_node); + + if (host->ops->match == NULL) + host->ops->match = default_irq_host_match; + + mutex_lock(&irq_domain_mutex); + /* Make sure only one legacy controller can be created */ + if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + list_for_each_entry(h, &irq_domain_list, link) { + if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(host->of_node); + kfree(host); + return NULL; + } + } + } + list_add(&host->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + + /* Additional setups per revmap type */ + switch(revmap_type) { + case IRQ_DOMAIN_MAP_LEGACY: + /* 0 is always the invalid number for legacy */ + host->inval_irq = 0; + /* setup us as the host for all legacy interrupts */ + for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { + struct irq_data *irq_data = irq_get_irq_data(i); + irq_data->hwirq = i; + irq_data->domain = host; + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(host, i, i); + + /* Clear norequest flags */ + irq_clear_status_flags(i, IRQ_NOREQUEST); + } + break; + case IRQ_DOMAIN_MAP_LINEAR: + rmap = (unsigned int *)(host + 1); + for (i = 0; i < revmap_arg; i++) + rmap[i] = NO_IRQ; + host->revmap_data.linear.size = revmap_arg; + host->revmap_data.linear.revmap = rmap; + break; + case IRQ_DOMAIN_MAP_TREE: + INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); + break; + default: + break; + } + + pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); + + return host; +} + +/** + * irq_find_host() - Locates a domain for a given device node + * @node: device-tree node of the interrupt controller + */ +struct irq_domain *irq_find_host(struct device_node *node) +{ + struct irq_domain *h, *found = NULL; + + /* We might want to match the legacy controller last since + * it might potentially be set to match all interrupts in + * the absence of a device node. This isn't a problem so far + * yet though... + */ + mutex_lock(&irq_domain_mutex); + list_for_each_entry(h, &irq_domain_list, link) + if (h->ops->match(h, node)) { + found = h; + break; + } + mutex_unlock(&irq_domain_mutex); + return found; +} +EXPORT_SYMBOL_GPL(irq_find_host); + +/** + * irq_set_default_host() - Set a "default" irq domain + * @host: default host pointer + * + * For convenience, it's possible to set a "default" domain that will be used + * whenever NULL is passed to irq_create_mapping(). It makes life easier for + * platforms that want to manipulate a few hard coded interrupt numbers that + * aren't properly represented in the device-tree. + */ +void irq_set_default_host(struct irq_domain *host) +{ + pr_debug("irq: Default host set to @0x%p\n", host); + + irq_default_host = host; +} + +/** + * irq_set_virq_count() - Set the maximum number of linux irqs + * @count: number of linux irqs, capped with NR_IRQS + * + * This is mainly for use by platforms like iSeries who want to program + * the virtual irq number in the controller to avoid the reverse mapping + */ +void irq_set_virq_count(unsigned int count) +{ + pr_debug("irq: Trying to set virq count to %d\n", count); + + BUG_ON(count < NUM_ISA_INTERRUPTS); + if (count < NR_IRQS) + irq_virq_count = count; +} + +static int irq_setup_virq(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + irq_data->hwirq = hwirq; + irq_data->domain = host; + if (host->ops->map(host, virq, hwirq)) { + pr_debug("irq: -> mapping failed, freeing\n"); + irq_data->domain = NULL; + irq_data->hwirq = 0; + return -1; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); + + return 0; +} + +/** + * irq_create_direct_mapping() - Allocate an irq for direct mapping + * @host: domain to allocate the irq for or NULL for default host + * + * This routine is used for irq controllers which can choose the hardware + * interrupt numbers they generate. In such a case it's simplest to use + * the linux irq as the hardware interrupt number. + */ +unsigned int irq_create_direct_mapping(struct irq_domain *host) +{ + unsigned int virq; + + if (host == NULL) + host = irq_default_host; + + BUG_ON(host == NULL); + WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + + virq = irq_alloc_desc_from(1, 0); + if (virq == NO_IRQ) { + pr_debug("irq: create_direct virq allocation failed\n"); + return NO_IRQ; + } + if (virq >= irq_virq_count) { + pr_err("ERROR: no free irqs available below %i maximum\n", + irq_virq_count); + irq_free_desc(virq); + return 0; + } + + pr_debug("irq: create_direct obtained virq %d\n", virq); + + if (irq_setup_virq(host, virq, virq)) { + irq_free_desc(virq); + return NO_IRQ; + } + + return virq; +} + +/** + * irq_create_mapping() - Map a hardware interrupt into linux irq space + * @host: host owning this hardware interrupt or NULL for default host + * @hwirq: hardware irq number in that host space + * + * Only one mapping per hardware interrupt is permitted. Returns a linux + * irq number. + * If the sense/trigger is to be specified, set_irq_type() should be called + * on the number returned from that call. + */ +unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int virq, hint; + + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); + + /* Look for default host if nececssary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) { + printk(KERN_WARNING "irq_create_mapping called for" + " NULL host, hwirq=%lx\n", hwirq); + WARN_ON(1); + return NO_IRQ; + } + pr_debug("irq: -> using host @%p\n", host); + + /* Check if mapping already exists */ + virq = irq_find_mapping(host, hwirq); + if (virq != NO_IRQ) { + pr_debug("irq: -> existing mapping on virq %d\n", virq); + return virq; + } + + /* Get a virtual interrupt number */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + /* Handle legacy */ + virq = (unsigned int)hwirq; + if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) + return NO_IRQ; + return virq; + } else { + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (virq == NO_IRQ) { + pr_debug("irq: -> virq allocation failed\n"); + return NO_IRQ; + } + } + + if (irq_setup_virq(host, virq, hwirq)) { + if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + irq_free_desc(virq); + return NO_IRQ; + } + + pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", + hwirq, host->of_node ? host->of_node->full_name : "null", virq); + + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_mapping); + +unsigned int irq_create_of_mapping(struct device_node *controller, + const u32 *intspec, unsigned int intsize) +{ + struct irq_domain *host; + irq_hw_number_t hwirq; + unsigned int type = IRQ_TYPE_NONE; + unsigned int virq; + + if (controller == NULL) + host = irq_default_host; + else + host = irq_find_host(controller); + if (host == NULL) { + printk(KERN_WARNING "irq: no irq host found for %s !\n", + controller->full_name); + return NO_IRQ; + } + + /* If host has no translation, then we assume interrupt line */ + if (host->ops->xlate == NULL) + hwirq = intspec[0]; + else { + if (host->ops->xlate(host, controller, intspec, intsize, + &hwirq, &type)) + return NO_IRQ; + } + + /* Create mapping */ + virq = irq_create_mapping(host, hwirq); + if (virq == NO_IRQ) + return virq; + + /* Set type if specified and different than the current one */ + if (type != IRQ_TYPE_NONE && + type != (irqd_get_trigger_type(irq_get_irq_data(virq)))) + irq_set_irq_type(virq, type); + return virq; +} +EXPORT_SYMBOL_GPL(irq_create_of_mapping); + +/** + * irq_dispose_mapping() - Unmap an interrupt + * @virq: linux irq number of the interrupt to unmap + */ +void irq_dispose_mapping(unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + struct irq_domain *host; + irq_hw_number_t hwirq; + + if (virq == NO_IRQ || !irq_data) + return; + + host = irq_data->domain; + if (WARN_ON(host == NULL)) + return; + + /* Never unmap legacy interrupts */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + + /* remove chip and handler */ + irq_set_chip_and_handler(virq, NULL, NULL); + + /* Make sure it's completed */ + synchronize_irq(virq); + + /* Tell the PIC about it */ + if (host->ops->unmap) + host->ops->unmap(host, virq); + smp_mb(); + + /* Clear reverse map */ + hwirq = irq_data->hwirq; + switch(host->revmap_type) { + case IRQ_DOMAIN_MAP_LINEAR: + if (hwirq < host->revmap_data.linear.size) + host->revmap_data.linear.revmap[hwirq] = NO_IRQ; + break; + case IRQ_DOMAIN_MAP_TREE: + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&host->revmap_data.tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + break; + } + + /* Destroy map */ + irq_data->hwirq = host->inval_irq; + + irq_free_desc(virq); +} +EXPORT_SYMBOL_GPL(irq_dispose_mapping); + +/** + * irq_find_mapping() - Find a linux irq from an hw irq number. + * @host: domain owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a slow path, for use by generic code. It's expected that an + * irq controller implementation directly calls the appropriate low level + * mapping function. + */ +unsigned int irq_find_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int i; + unsigned int hint = hwirq % irq_virq_count; + + /* Look for default host if nececssary */ + if (host == NULL) + host = irq_default_host; + if (host == NULL) + return NO_IRQ; + + /* legacy -> bail early */ + if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return hwirq; + + /* Slow path does a linear search of the map */ + if (hint == 0) + hint = 1; + i = hint; + do { + struct irq_data *data = irq_get_irq_data(i); + if (data && (data->domain == host) && (data->hwirq == hwirq)) + return i; + i++; + if (i >= irq_virq_count) + i = 1; + } while(i != hint); + return NO_IRQ; +} +EXPORT_SYMBOL_GPL(irq_find_mapping); + +/** + * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses radix tree + * revmaps + */ +unsigned int irq_radix_revmap_lookup(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data; + + if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(host, hwirq); + + /* + * Freeing an irq can delete nodes along the path to + * do the lookup via call_rcu. + */ + rcu_read_lock(); + irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); + rcu_read_unlock(); + + /* + * If found in radix tree, then fine. + * Else fallback to linear lookup - this should not happen in practice + * as it means that we failed to insert the node in the radix tree. + */ + return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); +} + +/** + * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. + * @host: host owning this hardware interrupt + * @virq: linux irq number + * @hwirq: hardware irq number in that host space + * + * This is for use by irq controllers that use a radix tree reverse + * mapping for fast lookup. + */ +void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, + irq_hw_number_t hwirq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return; + + if (virq != NO_IRQ) { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); + mutex_unlock(&revmap_trees_mutex); + } +} + +/** + * irq_linear_revmap() - Find a linux irq from a hw irq number. + * @host: host owning this hardware interrupt + * @hwirq: hardware irq number in that host space + * + * This is a fast path, for use by irq controller code that uses linear + * revmaps. It does fallback to the slow path if the revmap doesn't exist + * yet and will create the revmap entry with appropriate locking + */ +unsigned int irq_linear_revmap(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + unsigned int *revmap; + + if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(host, hwirq); + + /* Check revmap bounds */ + if (unlikely(hwirq >= host->revmap_data.linear.size)) + return irq_find_mapping(host, hwirq); + + /* Check if revmap was allocated */ + revmap = host->revmap_data.linear.revmap; + if (unlikely(revmap == NULL)) + return irq_find_mapping(host, hwirq); + + /* Fill up revmap with slow path if no mapping found */ + if (unlikely(revmap[hwirq] == NO_IRQ)) + revmap[hwirq] = irq_find_mapping(host, hwirq); + + return revmap[hwirq]; +} + +#ifdef CONFIG_VIRQ_DEBUG +static int virq_debug_show(struct seq_file *m, void *private) +{ + unsigned long flags; + struct irq_desc *desc; + const char *p; + static const char none[] = "none"; + void *data; + int i; + + seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", + "chip name", "chip data", "host name"); + + for (i = 1; i < nr_irqs; i++) { + desc = irq_to_desc(i); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + if (desc->action && desc->action->handler) { + struct irq_chip *chip; + + seq_printf(m, "%5d ", i); + seq_printf(m, "0x%05lx ", desc->irq_data.hwirq); + + chip = irq_desc_get_chip(desc); + if (chip && chip->name) + p = chip->name; + else + p = none; + seq_printf(m, "%-15s ", p); + + data = irq_desc_get_chip_data(desc); + seq_printf(m, "0x%16p ", data); + + if (desc->irq_data.domain->of_node) + p = desc->irq_data.domain->of_node->full_name; + else + p = none; + seq_printf(m, "%s\n", p); + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } + + return 0; +} + +static int virq_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, virq_debug_show, inode->i_private); +} + +static const struct file_operations virq_debug_fops = { + .open = virq_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init irq_debugfs_init(void) +{ + if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root, + NULL, &virq_debug_fops) == NULL) + return -ENOMEM; + + return 0; +} +__initcall(irq_debugfs_init); +#endif /* CONFIG_VIRQ_DEBUG */ + +#else /* CONFIG_PPC */ + /** * irq_domain_add() - Register an irq_domain * @domain: ptr to initialized irq_domain structure @@ -185,3 +783,5 @@ struct irq_domain_ops irq_domain_simple_ops = { #endif /* CONFIG_OF_IRQ */ }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#endif /* !CONFIG_PPC */ -- cgit v1.2.3 From 03848373ea741caafab952fb62405ed7fc0c279c Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:52 -0700 Subject: irq_domain: remove NO_IRQ from irq domain code zero always means no irq when using irq domains. Get rid of the NO_IRQ references. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index f551bc1d3167..8f7b91ce53c4 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -108,7 +108,7 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, case IRQ_DOMAIN_MAP_LINEAR: rmap = (unsigned int *)(host + 1); for (i = 0; i < revmap_arg; i++) - rmap[i] = NO_IRQ; + rmap[i] = 0; host->revmap_data.linear.size = revmap_arg; host->revmap_data.linear.revmap = rmap; break; @@ -218,9 +218,9 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { + if (!virq) { pr_debug("irq: create_direct virq allocation failed\n"); - return NO_IRQ; + return 0; } if (virq >= irq_virq_count) { pr_err("ERROR: no free irqs available below %i maximum\n", @@ -233,7 +233,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) if (irq_setup_virq(host, virq, virq)) { irq_free_desc(virq); - return NO_IRQ; + return 0; } return virq; @@ -263,13 +263,13 @@ unsigned int irq_create_mapping(struct irq_domain *host, printk(KERN_WARNING "irq_create_mapping called for" " NULL host, hwirq=%lx\n", hwirq); WARN_ON(1); - return NO_IRQ; + return 0; } pr_debug("irq: -> using host @%p\n", host); /* Check if mapping already exists */ virq = irq_find_mapping(host, hwirq); - if (virq != NO_IRQ) { + if (virq) { pr_debug("irq: -> existing mapping on virq %d\n", virq); return virq; } @@ -279,7 +279,7 @@ unsigned int irq_create_mapping(struct irq_domain *host, /* Handle legacy */ virq = (unsigned int)hwirq; if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return NO_IRQ; + return 0; return virq; } else { /* Allocate a virtual interrupt number */ @@ -289,16 +289,16 @@ unsigned int irq_create_mapping(struct irq_domain *host, virq = irq_alloc_desc_from(hint, 0); if (!virq) virq = irq_alloc_desc_from(1, 0); - if (virq == NO_IRQ) { + if (!virq) { pr_debug("irq: -> virq allocation failed\n"); - return NO_IRQ; + return 0; } } if (irq_setup_virq(host, virq, hwirq)) { if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) irq_free_desc(virq); - return NO_IRQ; + return 0; } pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", @@ -323,7 +323,7 @@ unsigned int irq_create_of_mapping(struct device_node *controller, if (host == NULL) { printk(KERN_WARNING "irq: no irq host found for %s !\n", controller->full_name); - return NO_IRQ; + return 0; } /* If host has no translation, then we assume interrupt line */ @@ -332,12 +332,12 @@ unsigned int irq_create_of_mapping(struct device_node *controller, else { if (host->ops->xlate(host, controller, intspec, intsize, &hwirq, &type)) - return NO_IRQ; + return 0; } /* Create mapping */ virq = irq_create_mapping(host, hwirq); - if (virq == NO_IRQ) + if (!virq) return virq; /* Set type if specified and different than the current one */ @@ -358,7 +358,7 @@ void irq_dispose_mapping(unsigned int virq) struct irq_domain *host; irq_hw_number_t hwirq; - if (virq == NO_IRQ || !irq_data) + if (!virq || !irq_data) return; host = irq_data->domain; @@ -387,7 +387,7 @@ void irq_dispose_mapping(unsigned int virq) switch(host->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: if (hwirq < host->revmap_data.linear.size) - host->revmap_data.linear.revmap[hwirq] = NO_IRQ; + host->revmap_data.linear.revmap[hwirq] = 0; break; case IRQ_DOMAIN_MAP_TREE: mutex_lock(&revmap_trees_mutex); @@ -422,7 +422,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, if (host == NULL) host = irq_default_host; if (host == NULL) - return NO_IRQ; + return 0; /* legacy -> bail early */ if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) @@ -440,7 +440,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, if (i >= irq_virq_count) i = 1; } while(i != hint); - return NO_IRQ; + return 0; } EXPORT_SYMBOL_GPL(irq_find_mapping); @@ -493,7 +493,7 @@ void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) return; - if (virq != NO_IRQ) { + if (virq) { mutex_lock(&revmap_trees_mutex); radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); @@ -527,7 +527,7 @@ unsigned int irq_linear_revmap(struct irq_domain *host, return irq_find_mapping(host, hwirq); /* Fill up revmap with slow path if no mapping found */ - if (unlikely(revmap[hwirq] == NO_IRQ)) + if (unlikely(!revmap[hwirq])) revmap[hwirq] = irq_find_mapping(host, hwirq); return revmap[hwirq]; -- cgit v1.2.3 From 68700650e71b6bb6636673f4f9c8ec807353d8d6 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:53 -0700 Subject: irq_domain: Remove references to old irq_host names No functional changes. Replaces non-exported references to 'host' with domain. Does not change any symbol names referenced by other .c files. Signed-off-by: Grant Likely Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- kernel/irq/irqdomain.c | 219 ++++++++++++++++++++++++------------------------- 1 file changed, 108 insertions(+), 111 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 8f7b91ce53c4..432d292b33f8 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -19,11 +19,11 @@ static DEFINE_MUTEX(irq_domain_mutex); #ifdef CONFIG_PPC static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; -static struct irq_domain *irq_default_host; +static struct irq_domain *irq_default_domain; -static int default_irq_host_match(struct irq_domain *h, struct device_node *np) +static int default_irq_domain_match(struct irq_domain *d, struct device_node *np) { - return h->of_node != NULL && h->of_node == np; + return d->of_node != NULL && d->of_node == np; } /** @@ -31,8 +31,8 @@ static int default_irq_host_match(struct irq_domain *h, struct device_node *np) * @of_node: optional device-tree node of the interrupt controller * @revmap_type: type of reverse mapping to use * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map - * @ops: map/unmap host callbacks - * @inval_irq: provide a hw number in that host space that is always invalid + * @ops: map/unmap domain callbacks + * @inval_irq: provide a hw number in that domain space that is always invalid * * Allocates and initialize and irq_domain structure. Note that in the case of * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns @@ -48,7 +48,7 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, struct irq_domain_ops *ops, irq_hw_number_t inval_irq) { - struct irq_domain *host, *h; + struct irq_domain *domain, *h; unsigned int size = sizeof(struct irq_domain); unsigned int i; unsigned int *rmap; @@ -56,18 +56,18 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, /* Allocate structure and revmap table if using linear mapping */ if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) size += revmap_arg * sizeof(unsigned int); - host = kzalloc(size, GFP_KERNEL); - if (host == NULL) + domain = kzalloc(size, GFP_KERNEL); + if (domain == NULL) return NULL; /* Fill structure */ - host->revmap_type = revmap_type; - host->inval_irq = inval_irq; - host->ops = ops; - host->of_node = of_node_get(of_node); + domain->revmap_type = revmap_type; + domain->inval_irq = inval_irq; + domain->ops = ops; + domain->of_node = of_node_get(of_node); - if (host->ops->match == NULL) - host->ops->match = default_irq_host_match; + if (domain->ops->match == NULL) + domain->ops->match = default_irq_domain_match; mutex_lock(&irq_domain_mutex); /* Make sure only one legacy controller can be created */ @@ -75,53 +75,53 @@ struct irq_domain *irq_alloc_host(struct device_node *of_node, list_for_each_entry(h, &irq_domain_list, link) { if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { mutex_unlock(&irq_domain_mutex); - of_node_put(host->of_node); - kfree(host); + of_node_put(domain->of_node); + kfree(domain); return NULL; } } } - list_add(&host->link, &irq_domain_list); + list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); /* Additional setups per revmap type */ switch(revmap_type) { case IRQ_DOMAIN_MAP_LEGACY: /* 0 is always the invalid number for legacy */ - host->inval_irq = 0; - /* setup us as the host for all legacy interrupts */ + domain->inval_irq = 0; + /* setup us as the domain for all legacy interrupts */ for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { struct irq_data *irq_data = irq_get_irq_data(i); irq_data->hwirq = i; - irq_data->domain = host; + irq_data->domain = domain; /* Legacy flags are left to default at this point, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(host, i, i); + ops->map(domain, i, i); /* Clear norequest flags */ irq_clear_status_flags(i, IRQ_NOREQUEST); } break; case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(host + 1); + rmap = (unsigned int *)(domain + 1); for (i = 0; i < revmap_arg; i++) rmap[i] = 0; - host->revmap_data.linear.size = revmap_arg; - host->revmap_data.linear.revmap = rmap; + domain->revmap_data.linear.size = revmap_arg; + domain->revmap_data.linear.revmap = rmap; break; case IRQ_DOMAIN_MAP_TREE: - INIT_RADIX_TREE(&host->revmap_data.tree, GFP_KERNEL); + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); break; default: break; } - pr_debug("irq: Allocated host of type %d @0x%p\n", revmap_type, host); + pr_debug("irq: Allocated domain of type %d @0x%p\n", revmap_type, domain); - return host; + return domain; } /** @@ -150,18 +150,18 @@ EXPORT_SYMBOL_GPL(irq_find_host); /** * irq_set_default_host() - Set a "default" irq domain - * @host: default host pointer + * @domain: default domain pointer * * For convenience, it's possible to set a "default" domain that will be used * whenever NULL is passed to irq_create_mapping(). It makes life easier for * platforms that want to manipulate a few hard coded interrupt numbers that * aren't properly represented in the device-tree. */ -void irq_set_default_host(struct irq_domain *host) +void irq_set_default_host(struct irq_domain *domain) { - pr_debug("irq: Default host set to @0x%p\n", host); + pr_debug("irq: Default domain set to @0x%p\n", domain); - irq_default_host = host; + irq_default_domain = domain; } /** @@ -180,14 +180,14 @@ void irq_set_virq_count(unsigned int count) irq_virq_count = count; } -static int irq_setup_virq(struct irq_domain *host, unsigned int virq, +static int irq_setup_virq(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); irq_data->hwirq = hwirq; - irq_data->domain = host; - if (host->ops->map(host, virq, hwirq)) { + irq_data->domain = domain; + if (domain->ops->map(domain, virq, hwirq)) { pr_debug("irq: -> mapping failed, freeing\n"); irq_data->domain = NULL; irq_data->hwirq = 0; @@ -201,21 +201,21 @@ static int irq_setup_virq(struct irq_domain *host, unsigned int virq, /** * irq_create_direct_mapping() - Allocate an irq for direct mapping - * @host: domain to allocate the irq for or NULL for default host + * @domain: domain to allocate the irq for or NULL for default domain * * This routine is used for irq controllers which can choose the hardware * interrupt numbers they generate. In such a case it's simplest to use * the linux irq as the hardware interrupt number. */ -unsigned int irq_create_direct_mapping(struct irq_domain *host) +unsigned int irq_create_direct_mapping(struct irq_domain *domain) { unsigned int virq; - if (host == NULL) - host = irq_default_host; + if (domain == NULL) + domain = irq_default_domain; - BUG_ON(host == NULL); - WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_NOMAP); + BUG_ON(domain == NULL); + WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_NOMAP); virq = irq_alloc_desc_from(1, 0); if (!virq) { @@ -231,7 +231,7 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) pr_debug("irq: create_direct obtained virq %d\n", virq); - if (irq_setup_virq(host, virq, virq)) { + if (irq_setup_virq(domain, virq, virq)) { irq_free_desc(virq); return 0; } @@ -241,41 +241,41 @@ unsigned int irq_create_direct_mapping(struct irq_domain *host) /** * irq_create_mapping() - Map a hardware interrupt into linux irq space - * @host: host owning this hardware interrupt or NULL for default host - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt or NULL for default domain + * @hwirq: hardware irq number in that domain space * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. */ -unsigned int irq_create_mapping(struct irq_domain *host, +unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int virq, hint; - pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", host, hwirq); + pr_debug("irq: irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) { + /* Look for default domain if nececssary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) { printk(KERN_WARNING "irq_create_mapping called for" - " NULL host, hwirq=%lx\n", hwirq); + " NULL domain, hwirq=%lx\n", hwirq); WARN_ON(1); return 0; } - pr_debug("irq: -> using host @%p\n", host); + pr_debug("irq: -> using domain @%p\n", domain); /* Check if mapping already exists */ - virq = irq_find_mapping(host, hwirq); + virq = irq_find_mapping(domain, hwirq); if (virq) { pr_debug("irq: -> existing mapping on virq %d\n", virq); return virq; } /* Get a virtual interrupt number */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { /* Handle legacy */ virq = (unsigned int)hwirq; if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) @@ -295,14 +295,14 @@ unsigned int irq_create_mapping(struct irq_domain *host, } } - if (irq_setup_virq(host, virq, hwirq)) { - if (host->revmap_type != IRQ_DOMAIN_MAP_LEGACY) + if (irq_setup_virq(domain, virq, hwirq)) { + if (domain->revmap_type != IRQ_DOMAIN_MAP_LEGACY) irq_free_desc(virq); return 0; } - pr_debug("irq: irq %lu on host %s mapped to virtual irq %u\n", - hwirq, host->of_node ? host->of_node->full_name : "null", virq); + pr_debug("irq: irq %lu on domain %s mapped to virtual irq %u\n", + hwirq, domain->of_node ? domain->of_node->full_name : "null", virq); return virq; } @@ -311,32 +311,29 @@ EXPORT_SYMBOL_GPL(irq_create_mapping); unsigned int irq_create_of_mapping(struct device_node *controller, const u32 *intspec, unsigned int intsize) { - struct irq_domain *host; + struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; unsigned int virq; - if (controller == NULL) - host = irq_default_host; - else - host = irq_find_host(controller); - if (host == NULL) { - printk(KERN_WARNING "irq: no irq host found for %s !\n", + domain = controller ? irq_find_host(controller) : irq_default_domain; + if (!domain) { + printk(KERN_WARNING "irq: no irq domain found for %s !\n", controller->full_name); return 0; } - /* If host has no translation, then we assume interrupt line */ - if (host->ops->xlate == NULL) + /* If domain has no translation, then we assume interrupt line */ + if (domain->ops->xlate == NULL) hwirq = intspec[0]; else { - if (host->ops->xlate(host, controller, intspec, intsize, + if (domain->ops->xlate(domain, controller, intspec, intsize, &hwirq, &type)) return 0; } /* Create mapping */ - virq = irq_create_mapping(host, hwirq); + virq = irq_create_mapping(domain, hwirq); if (!virq) return virq; @@ -355,18 +352,18 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping); void irq_dispose_mapping(unsigned int virq) { struct irq_data *irq_data = irq_get_irq_data(virq); - struct irq_domain *host; + struct irq_domain *domain; irq_hw_number_t hwirq; if (!virq || !irq_data) return; - host = irq_data->domain; - if (WARN_ON(host == NULL)) + domain = irq_data->domain; + if (WARN_ON(domain == NULL)) return; /* Never unmap legacy interrupts */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return; irq_set_status_flags(virq, IRQ_NOREQUEST); @@ -378,26 +375,26 @@ void irq_dispose_mapping(unsigned int virq) synchronize_irq(virq); /* Tell the PIC about it */ - if (host->ops->unmap) - host->ops->unmap(host, virq); + if (domain->ops->unmap) + domain->ops->unmap(domain, virq); smp_mb(); /* Clear reverse map */ hwirq = irq_data->hwirq; - switch(host->revmap_type) { + switch(domain->revmap_type) { case IRQ_DOMAIN_MAP_LINEAR: - if (hwirq < host->revmap_data.linear.size) - host->revmap_data.linear.revmap[hwirq] = 0; + if (hwirq < domain->revmap_data.linear.size) + domain->revmap_data.linear.revmap[hwirq] = 0; break; case IRQ_DOMAIN_MAP_TREE: mutex_lock(&revmap_trees_mutex); - radix_tree_delete(&host->revmap_data.tree, hwirq); + radix_tree_delete(&domain->revmap_data.tree, hwirq); mutex_unlock(&revmap_trees_mutex); break; } /* Destroy map */ - irq_data->hwirq = host->inval_irq; + irq_data->hwirq = domain->inval_irq; irq_free_desc(virq); } @@ -405,27 +402,27 @@ EXPORT_SYMBOL_GPL(irq_dispose_mapping); /** * irq_find_mapping() - Find a linux irq from an hw irq number. - * @host: domain owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a slow path, for use by generic code. It's expected that an * irq controller implementation directly calls the appropriate low level * mapping function. */ -unsigned int irq_find_mapping(struct irq_domain *host, +unsigned int irq_find_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int i; unsigned int hint = hwirq % irq_virq_count; - /* Look for default host if nececssary */ - if (host == NULL) - host = irq_default_host; - if (host == NULL) + /* Look for default domain if nececssary */ + if (domain == NULL) + domain = irq_default_domain; + if (domain == NULL) return 0; /* legacy -> bail early */ - if (host->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) return hwirq; /* Slow path does a linear search of the map */ @@ -434,7 +431,7 @@ unsigned int irq_find_mapping(struct irq_domain *host, i = hint; do { struct irq_data *data = irq_get_irq_data(i); - if (data && (data->domain == host) && (data->hwirq == hwirq)) + if (data && (data->domain == domain) && (data->hwirq == hwirq)) return i; i++; if (i >= irq_virq_count) @@ -446,26 +443,26 @@ EXPORT_SYMBOL_GPL(irq_find_mapping); /** * irq_radix_revmap_lookup() - Find a linux irq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a fast path, for use by irq controller code that uses radix tree * revmaps */ -unsigned int irq_radix_revmap_lookup(struct irq_domain *host, +unsigned int irq_radix_revmap_lookup(struct irq_domain *domain, irq_hw_number_t hwirq) { struct irq_data *irq_data; - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) - return irq_find_mapping(host, hwirq); + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) + return irq_find_mapping(domain, hwirq); /* * Freeing an irq can delete nodes along the path to * do the lookup via call_rcu. */ rcu_read_lock(); - irq_data = radix_tree_lookup(&host->revmap_data.tree, hwirq); + irq_data = radix_tree_lookup(&domain->revmap_data.tree, hwirq); rcu_read_unlock(); /* @@ -473,62 +470,62 @@ unsigned int irq_radix_revmap_lookup(struct irq_domain *host, * Else fallback to linear lookup - this should not happen in practice * as it means that we failed to insert the node in the radix tree. */ - return irq_data ? irq_data->irq : irq_find_mapping(host, hwirq); + return irq_data ? irq_data->irq : irq_find_mapping(domain, hwirq); } /** * irq_radix_revmap_insert() - Insert a hw irq to linux irq number mapping. - * @host: host owning this hardware interrupt + * @domain: domain owning this hardware interrupt * @virq: linux irq number - * @hwirq: hardware irq number in that host space + * @hwirq: hardware irq number in that domain space * * This is for use by irq controllers that use a radix tree reverse * mapping for fast lookup. */ -void irq_radix_revmap_insert(struct irq_domain *host, unsigned int virq, +void irq_radix_revmap_insert(struct irq_domain *domain, unsigned int virq, irq_hw_number_t hwirq) { struct irq_data *irq_data = irq_get_irq_data(virq); - if (WARN_ON(host->revmap_type != IRQ_DOMAIN_MAP_TREE)) + if (WARN_ON(domain->revmap_type != IRQ_DOMAIN_MAP_TREE)) return; if (virq) { mutex_lock(&revmap_trees_mutex); - radix_tree_insert(&host->revmap_data.tree, hwirq, irq_data); + radix_tree_insert(&domain->revmap_data.tree, hwirq, irq_data); mutex_unlock(&revmap_trees_mutex); } } /** * irq_linear_revmap() - Find a linux irq from a hw irq number. - * @host: host owning this hardware interrupt - * @hwirq: hardware irq number in that host space + * @domain: domain owning this hardware interrupt + * @hwirq: hardware irq number in that domain space * * This is a fast path, for use by irq controller code that uses linear * revmaps. It does fallback to the slow path if the revmap doesn't exist * yet and will create the revmap entry with appropriate locking */ -unsigned int irq_linear_revmap(struct irq_domain *host, +unsigned int irq_linear_revmap(struct irq_domain *domain, irq_hw_number_t hwirq) { unsigned int *revmap; - if (WARN_ON_ONCE(host->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) - return irq_find_mapping(host, hwirq); + if (WARN_ON_ONCE(domain->revmap_type != IRQ_DOMAIN_MAP_LINEAR)) + return irq_find_mapping(domain, hwirq); /* Check revmap bounds */ - if (unlikely(hwirq >= host->revmap_data.linear.size)) - return irq_find_mapping(host, hwirq); + if (unlikely(hwirq >= domain->revmap_data.linear.size)) + return irq_find_mapping(domain, hwirq); /* Check if revmap was allocated */ - revmap = host->revmap_data.linear.revmap; + revmap = domain->revmap_data.linear.revmap; if (unlikely(revmap == NULL)) - return irq_find_mapping(host, hwirq); + return irq_find_mapping(domain, hwirq); /* Fill up revmap with slow path if no mapping found */ if (unlikely(!revmap[hwirq])) - revmap[hwirq] = irq_find_mapping(host, hwirq); + revmap[hwirq] = irq_find_mapping(domain, hwirq); return revmap[hwirq]; } @@ -544,7 +541,7 @@ static int virq_debug_show(struct seq_file *m, void *private) int i; seq_printf(m, "%-5s %-7s %-15s %-18s %s\n", "virq", "hwirq", - "chip name", "chip data", "host name"); + "chip name", "chip data", "domain name"); for (i = 1; i < nr_irqs; i++) { desc = irq_to_desc(i); -- cgit v1.2.3 From a8db8cf0d894df5f1dcfd4bce9894e0dbcc01c96 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:54 -0700 Subject: irq_domain: Replace irq_alloc_host() with revmap-specific initializers Each revmap type has different arguments for setting up the revmap. This patch splits up the generator functions so that each revmap type can do its own setup and the user doesn't need to keep track of how each revmap type handles the arguments. This patch also adds a host_data argument to the generators. There are cases where the host_data pointer will be needed before the function returns. ie. the legacy map calls the .map callback for each irq before returning. v2: - Add void *host_data argument to irq_domain_add_*() functions - fixed failure to compile - Moved IRQ_DOMAIN_MAP_* defines into irqdomain.c Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/platforms/512x/mpc5121_ads_cpld.c | 3 +- arch/powerpc/platforms/52xx/media5200.c | 7 +- arch/powerpc/platforms/52xx/mpc52xx_gpt.c | 6 +- arch/powerpc/platforms/52xx/mpc52xx_pic.c | 4 +- arch/powerpc/platforms/82xx/pq2ads-pci-pic.c | 6 +- arch/powerpc/platforms/85xx/socrates_fpga_pic.c | 5 +- arch/powerpc/platforms/86xx/gef_pic.c | 5 +- arch/powerpc/platforms/cell/axon_msi.c | 5 +- arch/powerpc/platforms/cell/beat_interrupt.c | 4 +- arch/powerpc/platforms/cell/interrupt.c | 4 +- arch/powerpc/platforms/cell/spider-pic.c | 6 +- arch/powerpc/platforms/embedded6xx/flipper-pic.c | 6 +- arch/powerpc/platforms/embedded6xx/hlwd-pic.c | 5 +- arch/powerpc/platforms/iseries/irq.c | 3 +- arch/powerpc/platforms/powermac/pic.c | 5 +- arch/powerpc/platforms/powermac/smp.c | 3 +- arch/powerpc/platforms/ps3/interrupt.c | 3 +- arch/powerpc/platforms/wsp/opb_pic.c | 7 +- arch/powerpc/sysdev/cpm1.c | 3 +- arch/powerpc/sysdev/cpm2_pic.c | 3 +- arch/powerpc/sysdev/ehv_pic.c | 6 +- arch/powerpc/sysdev/fsl_msi.c | 6 +- arch/powerpc/sysdev/i8259.c | 3 +- arch/powerpc/sysdev/ipic.c | 7 +- arch/powerpc/sysdev/mpc8xx_pic.c | 3 +- arch/powerpc/sysdev/mpic.c | 7 +- arch/powerpc/sysdev/mv64x60_pic.c | 5 +- arch/powerpc/sysdev/qe_lib/qe_ic.c | 5 +- arch/powerpc/sysdev/tsi108_pci.c | 3 +- arch/powerpc/sysdev/uic.c | 6 +- arch/powerpc/sysdev/xics/xics-common.c | 3 +- arch/powerpc/sysdev/xilinx_intc.c | 5 +- drivers/gpio/gpio-mpc8xxx.c | 7 +- include/linux/irqdomain.h | 24 ++- kernel/irq/irqdomain.c | 200 +++++++++++++++-------- 35 files changed, 198 insertions(+), 185 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c index fefa7977fa9f..291d61c94718 100644 --- a/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c +++ b/arch/powerpc/platforms/512x/mpc5121_ads_cpld.c @@ -190,8 +190,7 @@ mpc5121_ads_cpld_pic_init(void) cpld_pic_node = of_node_get(np); - cpld_pic_host = - irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, 16, &cpld_pic_host_ops, 16); + cpld_pic_host = irq_domain_add_linear(np, 16, &cpld_pic_host_ops, NULL); if (!cpld_pic_host) { printk(KERN_ERR "CPLD PIC: failed to allocate irq host!\n"); goto end; diff --git a/arch/powerpc/platforms/52xx/media5200.c b/arch/powerpc/platforms/52xx/media5200.c index a746415c4242..5db5cfb6a4ff 100644 --- a/arch/powerpc/platforms/52xx/media5200.c +++ b/arch/powerpc/platforms/52xx/media5200.c @@ -173,15 +173,12 @@ static void __init media5200_init_irq(void) spin_lock_init(&media5200_irq.lock); - media5200_irq.irqhost = irq_alloc_host(fpga_np, IRQ_DOMAIN_MAP_LINEAR, - MEDIA5200_NUM_IRQS, - &media5200_irq_ops, -1); + media5200_irq.irqhost = irq_domain_add_linear(fpga_np, + MEDIA5200_NUM_IRQS, &media5200_irq_ops, &media5200_irq); if (!media5200_irq.irqhost) goto out; pr_debug("%s: allocated irqhost\n", __func__); - media5200_irq.irqhost->host_data = &media5200_irq; - irq_set_handler_data(cascade_virq, &media5200_irq); irq_set_chained_handler(cascade_virq, media5200_irq_cascade); diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index e90af8fd8413..b53275d12727 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -252,14 +252,12 @@ mpc52xx_gpt_irq_setup(struct mpc52xx_gpt_priv *gpt, struct device_node *node) if (!cascade_virq) return; - gpt->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, 1, - &mpc52xx_gpt_irq_ops, -1); + gpt->irqhost = irq_domain_add_linear(node, 1, &mpc52xx_gpt_irq_ops, gpt); if (!gpt->irqhost) { - dev_err(gpt->dev, "irq_alloc_host() failed\n"); + dev_err(gpt->dev, "irq_domain_add_linear() failed\n"); return; } - gpt->irqhost->host_data = gpt; irq_set_handler_data(cascade_virq, gpt); irq_set_chained_handler(cascade_virq, mpc52xx_gpt_irq_cascade); diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pic.c b/arch/powerpc/platforms/52xx/mpc52xx_pic.c index 8c997f1a9122..41fa67126c44 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_pic.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_pic.c @@ -444,9 +444,9 @@ void __init mpc52xx_init_irq(void) * As last step, add an irq host to translate the real * hw irq information provided by the ofw to linux virq */ - mpc52xx_irqhost = irq_alloc_host(picnode, IRQ_DOMAIN_MAP_LINEAR, + mpc52xx_irqhost = irq_domain_add_linear(picnode, MPC52xx_IRQ_HIGHTESTHWIRQ, - &mpc52xx_irqhost_ops, -1); + &mpc52xx_irqhost_ops, NULL); if (!mpc52xx_irqhost) panic(__FILE__ ": Cannot allocate the IRQ host\n"); diff --git a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c index bdba174e7b3a..4ef9d6918e23 100644 --- a/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c +++ b/arch/powerpc/platforms/82xx/pq2ads-pci-pic.c @@ -156,17 +156,13 @@ int __init pq2ads_pci_init_irq(void) out_be32(&priv->regs->mask, ~0); mb(); - host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, NUM_IRQS, - &pci_pic_host_ops, NUM_IRQS); + host = irq_domain_add_linear(np, NUM_IRQS, &pci_pic_host_ops, priv); if (!host) { ret = -ENOMEM; goto out_unmap_regs; } - host->host_data = priv; - priv->host = host; - host->host_data = priv; irq_set_handler_data(irq, priv); irq_set_chained_handler(irq, pq2ads_pci_irq_demux); diff --git a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c index e3ef7c9ed7b1..1092c121adf3 100644 --- a/arch/powerpc/platforms/85xx/socrates_fpga_pic.c +++ b/arch/powerpc/platforms/85xx/socrates_fpga_pic.c @@ -280,9 +280,8 @@ void socrates_fpga_pic_init(struct device_node *pic) int i; /* Setup an irq_domain structure */ - socrates_fpga_pic_irq_host = irq_alloc_host(pic, IRQ_DOMAIN_MAP_LINEAR, - SOCRATES_FPGA_NUM_IRQS, &socrates_fpga_pic_host_ops, - SOCRATES_FPGA_NUM_IRQS); + socrates_fpga_pic_irq_host = irq_domain_add_linear(pic, + SOCRATES_FPGA_NUM_IRQS, &socrates_fpga_pic_host_ops, NULL); if (socrates_fpga_pic_irq_host == NULL) { pr_err("FPGA PIC: Unable to allocate host\n"); return; diff --git a/arch/powerpc/platforms/86xx/gef_pic.c b/arch/powerpc/platforms/86xx/gef_pic.c index 0cf8af230bcf..126a94b530e4 100644 --- a/arch/powerpc/platforms/86xx/gef_pic.c +++ b/arch/powerpc/platforms/86xx/gef_pic.c @@ -212,9 +212,8 @@ void __init gef_pic_init(struct device_node *np) } /* Setup an irq_domain structure */ - gef_pic_irq_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - GEF_PIC_NUM_IRQS, - &gef_pic_host_ops, NO_IRQ); + gef_pic_irq_host = irq_domain_add_linear(np, GEF_PIC_NUM_IRQS, + &gef_pic_host_ops, NULL); if (gef_pic_irq_host == NULL) return; diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c index 1bfd18a48a7f..cf9fd3c8a9b9 100644 --- a/arch/powerpc/platforms/cell/axon_msi.c +++ b/arch/powerpc/platforms/cell/axon_msi.c @@ -392,16 +392,13 @@ static int axon_msi_probe(struct platform_device *device) } memset(msic->fifo_virt, 0xff, MSIC_FIFO_SIZE_BYTES); - msic->irq_domain = irq_alloc_host(dn, IRQ_DOMAIN_MAP_NOMAP, - NR_IRQS, &msic_host_ops, 0); + msic->irq_domain = irq_domain_add_nomap(dn, &msic_host_ops, msic); if (!msic->irq_domain) { printk(KERN_ERR "axon_msi: couldn't allocate irq_domain for %s\n", dn->full_name); goto out_free_fifo; } - msic->irq_domain->host_data = msic; - irq_set_handler_data(virq, msic); irq_set_chained_handler(virq, axon_msi_cascade); pr_devel("axon_msi: irq 0x%x setup for axon_msi\n", virq); diff --git a/arch/powerpc/platforms/cell/beat_interrupt.c b/arch/powerpc/platforms/cell/beat_interrupt.c index 21b64cfef5e5..bbdf57440cd5 100644 --- a/arch/powerpc/platforms/cell/beat_interrupt.c +++ b/arch/powerpc/platforms/cell/beat_interrupt.c @@ -239,9 +239,7 @@ void __init beatic_init_IRQ(void) ppc_md.get_irq = beatic_get_irq; /* Allocate an irq host */ - beatic_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, - &beatic_pic_host_ops, - 0); + beatic_host = irq_domain_add_nomap(NULL, &beatic_pic_host_ops, NULL); BUG_ON(beatic_host == NULL); irq_set_default_host(beatic_host); } diff --git a/arch/powerpc/platforms/cell/interrupt.c b/arch/powerpc/platforms/cell/interrupt.c index 6888475e7c62..c844797a6898 100644 --- a/arch/powerpc/platforms/cell/interrupt.c +++ b/arch/powerpc/platforms/cell/interrupt.c @@ -378,8 +378,8 @@ static int __init setup_iic(void) void __init iic_init_IRQ(void) { /* Setup an irq host data structure */ - iic_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_LINEAR, IIC_SOURCE_COUNT, - &iic_host_ops, IIC_IRQ_INVALID); + iic_host = irq_domain_add_linear(NULL, IIC_SOURCE_COUNT, &iic_host_ops, + NULL); BUG_ON(iic_host == NULL); irq_set_default_host(iic_host); diff --git a/arch/powerpc/platforms/cell/spider-pic.c b/arch/powerpc/platforms/cell/spider-pic.c index 1f935a772ef8..6521d202284a 100644 --- a/arch/powerpc/platforms/cell/spider-pic.c +++ b/arch/powerpc/platforms/cell/spider-pic.c @@ -299,12 +299,10 @@ static void __init spider_init_one(struct device_node *of_node, int chip, panic("spider_pic: can't map registers !"); /* Allocate a host */ - pic->host = irq_alloc_host(of_node, IRQ_DOMAIN_MAP_LINEAR, - SPIDER_SRC_COUNT, &spider_host_ops, - SPIDER_IRQ_INVALID); + pic->host = irq_domain_add_linear(of_node, SPIDER_SRC_COUNT, + &spider_host_ops, pic); if (pic->host == NULL) panic("spider_pic: can't allocate irq host !"); - pic->host->host_data = pic; /* Go through all sources and disable them */ for (i = 0; i < SPIDER_SRC_COUNT; i++) { diff --git a/arch/powerpc/platforms/embedded6xx/flipper-pic.c b/arch/powerpc/platforms/embedded6xx/flipper-pic.c index f862361730fb..434597166ca4 100644 --- a/arch/powerpc/platforms/embedded6xx/flipper-pic.c +++ b/arch/powerpc/platforms/embedded6xx/flipper-pic.c @@ -159,15 +159,13 @@ struct irq_domain * __init flipper_pic_init(struct device_node *np) __flipper_quiesce(io_base); - irq_domain = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, FLIPPER_NR_IRQS, - &flipper_irq_domain_ops, -1); + irq_domain = irq_domain_add_linear(np, FLIPPER_NR_IRQS, + &flipper_irq_domain_ops, io_base); if (!irq_domain) { pr_err("failed to allocate irq_domain\n"); return NULL; } - irq_domain->host_data = io_base; - out: return irq_domain; } diff --git a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c index 2d4a5d48fbbd..499d410b95fa 100644 --- a/arch/powerpc/platforms/embedded6xx/hlwd-pic.c +++ b/arch/powerpc/platforms/embedded6xx/hlwd-pic.c @@ -177,13 +177,12 @@ struct irq_domain *hlwd_pic_init(struct device_node *np) __hlwd_quiesce(io_base); - irq_domain = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, HLWD_NR_IRQS, - &hlwd_irq_domain_ops, -1); + irq_domain = irq_domain_add_linear(np, HLWD_NR_IRQS, + &hlwd_irq_domain_ops, io_base); if (!irq_domain) { pr_err("failed to allocate irq_domain\n"); return NULL; } - irq_domain->host_data = io_base; return irq_domain; } diff --git a/arch/powerpc/platforms/iseries/irq.c b/arch/powerpc/platforms/iseries/irq.c index b07d4f2e0155..5538b593079d 100644 --- a/arch/powerpc/platforms/iseries/irq.c +++ b/arch/powerpc/platforms/iseries/irq.c @@ -380,8 +380,7 @@ void __init iSeries_init_IRQ(void) /* Create irq host. No need for a revmap since HV will give us * back our virtual irq number */ - host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, - &iseries_irq_domain_ops, 0); + host = irq_domain_add_nomap(NULL, &iseries_irq_domain_ops, NULL); BUG_ON(host == NULL); irq_set_default_host(host); diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index cff326ab8ef2..646fdf3b9c0c 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -352,9 +352,8 @@ static void __init pmac_pic_probe_oldstyle(void) /* * Allocate an irq host */ - pmac_pic_host = irq_alloc_host(master, IRQ_DOMAIN_MAP_LINEAR, max_irqs, - &pmac_pic_host_ops, - max_irqs); + pmac_pic_host = irq_domain_add_linear(master, max_irqs, + &pmac_pic_host_ops, NULL); BUG_ON(pmac_pic_host == NULL); irq_set_default_host(pmac_pic_host); diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 6b1ef2d4dea0..09afd704f07a 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -192,8 +192,7 @@ static int psurge_secondary_ipi_init(void) { int rc = -ENOMEM; - psurge_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, - &psurge_host_ops, 0); + psurge_host = irq_domain_add_nomap(NULL, &psurge_host_ops, NULL); if (psurge_host) psurge_secondary_virq = irq_create_direct_mapping(psurge_host); diff --git a/arch/powerpc/platforms/ps3/interrupt.c b/arch/powerpc/platforms/ps3/interrupt.c index c5980e422dc6..c05808f21015 100644 --- a/arch/powerpc/platforms/ps3/interrupt.c +++ b/arch/powerpc/platforms/ps3/interrupt.c @@ -753,8 +753,7 @@ void __init ps3_init_IRQ(void) unsigned cpu; struct irq_domain *host; - host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_NOMAP, 0, &ps3_host_ops, - PS3_INVALID_OUTLET); + host = irq_domain_add_nomap(NULL, &ps3_host_ops, NULL); irq_set_default_host(host); irq_set_virq_count(PS3_PLUG_MAX + 1); diff --git a/arch/powerpc/platforms/wsp/opb_pic.c b/arch/powerpc/platforms/wsp/opb_pic.c index 76b33bc36116..4837515c2826 100644 --- a/arch/powerpc/platforms/wsp/opb_pic.c +++ b/arch/powerpc/platforms/wsp/opb_pic.c @@ -263,13 +263,11 @@ struct opb_pic *opb_pic_init_one(struct device_node *dn) goto free_opb; } - /* Allocate an irq host so that Linux knows that despite only + /* Allocate an irq domain so that Linux knows that despite only * having one interrupt to issue, we're the controller for multiple * hardware IRQs, so later we can lookup their virtual IRQs. */ - opb->host = irq_alloc_host(dn, IRQ_DOMAIN_MAP_LINEAR, - OPB_NR_IRQS, &opb_host_ops, -1); - + opb->host = irq_domain_add_linear(dn, OPB_NR_IRQS, &opb_host_ops, opb); if (!opb->host) { printk(KERN_ERR "opb: Failed to allocate IRQ host!\n"); goto free_regs; @@ -277,7 +275,6 @@ struct opb_pic *opb_pic_init_one(struct device_node *dn) opb->index = opb_index++; spin_lock_init(&opb->lock); - opb->host->host_data = opb; /* Disable all interrupts by default */ opb_out(opb, OPB_MLSASIER, 0); diff --git a/arch/powerpc/sysdev/cpm1.c b/arch/powerpc/sysdev/cpm1.c index 0877a757c209..53f39dbbfb97 100644 --- a/arch/powerpc/sysdev/cpm1.c +++ b/arch/powerpc/sysdev/cpm1.c @@ -164,8 +164,7 @@ unsigned int cpm_pic_init(void) out_be32(&cpic_reg->cpic_cimr, 0); - cpm_pic_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - 64, &cpm_pic_host_ops, 64); + cpm_pic_host = irq_domain_add_linear(np, 64, &cpm_pic_host_ops, NULL); if (cpm_pic_host == NULL) { printk(KERN_ERR "CPM2 PIC: failed to allocate irq host!\n"); sirq = NO_IRQ; diff --git a/arch/powerpc/sysdev/cpm2_pic.c b/arch/powerpc/sysdev/cpm2_pic.c index b149baae5d33..b3643322c528 100644 --- a/arch/powerpc/sysdev/cpm2_pic.c +++ b/arch/powerpc/sysdev/cpm2_pic.c @@ -275,8 +275,7 @@ void cpm2_pic_init(struct device_node *node) out_be32(&cpm2_intctl->ic_scprrl, 0x05309770); /* create a legacy host */ - cpm2_pic_host = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - 64, &cpm2_pic_host_ops, 64); + cpm2_pic_host = irq_domain_add_linear(node, 64, &cpm2_pic_host_ops, NULL); if (cpm2_pic_host == NULL) { printk(KERN_ERR "CPM2 PIC: failed to allocate irq host!\n"); return; diff --git a/arch/powerpc/sysdev/ehv_pic.c b/arch/powerpc/sysdev/ehv_pic.c index 48d3ba1220d3..adea322ea18f 100644 --- a/arch/powerpc/sysdev/ehv_pic.c +++ b/arch/powerpc/sysdev/ehv_pic.c @@ -275,9 +275,8 @@ void __init ehv_pic_init(void) return; } - ehv_pic->irqhost = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - NR_EHV_PIC_INTS, &ehv_pic_host_ops, 0); - + ehv_pic->irqhost = irq_domain_add_linear(np, NR_EHV_PIC_INTS, + &ehv_pic_host_ops, ehv_pic); if (!ehv_pic->irqhost) { of_node_put(np); kfree(ehv_pic); @@ -293,7 +292,6 @@ void __init ehv_pic_init(void) of_node_put(np2); } - ehv_pic->irqhost->host_data = ehv_pic; ehv_pic->hc_irq = ehv_pic_irq_chip; ehv_pic->hc_irq.irq_set_affinity = ehv_pic_set_affinity; ehv_pic->coreint_flag = coreint_flag; diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c index 3f9a301c4550..f4fd95bc1278 100644 --- a/arch/powerpc/sysdev/fsl_msi.c +++ b/arch/powerpc/sysdev/fsl_msi.c @@ -387,8 +387,8 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev) } platform_set_drvdata(dev, msi); - msi->irqhost = irq_alloc_host(dev->dev.of_node, IRQ_DOMAIN_MAP_LINEAR, - NR_MSI_IRQS, &fsl_msi_host_ops, 0); + msi->irqhost = irq_domain_add_linear(dev->dev.of_node, + NR_MSI_IRQS, &fsl_msi_host_ops, msi); if (msi->irqhost == NULL) { dev_err(&dev->dev, "No memory for MSI irqhost\n"); @@ -420,8 +420,6 @@ static int __devinit fsl_of_msi_probe(struct platform_device *dev) msi->feature = features->fsl_pic_ip; - msi->irqhost->host_data = msi; - /* * Remember the phandle, so that we can match with any PCI nodes * that have an "fsl,msi" property. diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 7e67890b8fc2..573a73bd954a 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -263,8 +263,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) raw_spin_unlock_irqrestore(&i8259_lock, flags); /* create a legacy host */ - i8259_host = irq_alloc_host(node, IRQ_DOMAIN_MAP_LEGACY, - 0, &i8259_host_ops, 0); + i8259_host = irq_domain_add_legacy(node, &i8259_host_ops, NULL); if (i8259_host == NULL) { printk(KERN_ERR "i8259: failed to allocate irq host !\n"); return; diff --git a/arch/powerpc/sysdev/ipic.c b/arch/powerpc/sysdev/ipic.c index 9abed3760545..0eaaa01c11b3 100644 --- a/arch/powerpc/sysdev/ipic.c +++ b/arch/powerpc/sysdev/ipic.c @@ -728,9 +728,8 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) if (ipic == NULL) return NULL; - ipic->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - NR_IPIC_INTS, - &ipic_host_ops, 0); + ipic->irqhost = irq_domain_add_linear(node, NR_IPIC_INTS, + &ipic_host_ops, ipic); if (ipic->irqhost == NULL) { kfree(ipic); return NULL; @@ -738,8 +737,6 @@ struct ipic * __init ipic_init(struct device_node *node, unsigned int flags) ipic->regs = ioremap(res.start, resource_size(&res)); - ipic->irqhost->host_data = ipic; - /* init hw */ ipic_write(ipic->regs, IPIC_SICNR, 0x0); diff --git a/arch/powerpc/sysdev/mpc8xx_pic.c b/arch/powerpc/sysdev/mpc8xx_pic.c index 978dfc4c3120..d5f5416be310 100644 --- a/arch/powerpc/sysdev/mpc8xx_pic.c +++ b/arch/powerpc/sysdev/mpc8xx_pic.c @@ -171,8 +171,7 @@ int mpc8xx_pic_init(void) goto out; } - mpc8xx_pic_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - 64, &mpc8xx_pic_host_ops, 64); + mpc8xx_pic_host = irq_domain_add_linear(np, 64, &mpc8xx_pic_host_ops, NULL); if (mpc8xx_pic_host == NULL) { printk(KERN_ERR "MPC8xx PIC: failed to allocate irq host!\n"); ret = -ENOMEM; diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index c844d343bf32..c83a512fa175 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -1345,10 +1345,9 @@ struct mpic * __init mpic_alloc(struct device_node *node, mpic->isu_shift = 1 + __ilog2(mpic->isu_size - 1); mpic->isu_mask = (1 << mpic->isu_shift) - 1; - mpic->irqhost = irq_alloc_host(mpic->node, IRQ_DOMAIN_MAP_LINEAR, + mpic->irqhost = irq_domain_add_linear(mpic->node, isu_size ? isu_size : mpic->num_sources, - &mpic_host_ops, - flags & MPIC_LARGE_VECTORS ? 2048 : 256); + &mpic_host_ops, mpic); /* * FIXME: The code leaks the MPIC object and mappings here; this @@ -1357,8 +1356,6 @@ struct mpic * __init mpic_alloc(struct device_node *node, if (mpic->irqhost == NULL) return NULL; - mpic->irqhost->host_data = mpic; - /* Display version */ switch (greg_feature & MPIC_GREG_FEATURE_VERSION_MASK) { case 1: diff --git a/arch/powerpc/sysdev/mv64x60_pic.c b/arch/powerpc/sysdev/mv64x60_pic.c index 45124a1959d0..8848e99a83f2 100644 --- a/arch/powerpc/sysdev/mv64x60_pic.c +++ b/arch/powerpc/sysdev/mv64x60_pic.c @@ -250,9 +250,8 @@ void __init mv64x60_init_irq(void) paddr = of_translate_address(np, reg); mv64x60_irq_reg_base = ioremap(paddr, reg[1]); - mv64x60_irq_host = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, - MV64x60_NUM_IRQS, - &mv64x60_host_ops, MV64x60_NUM_IRQS); + mv64x60_irq_host = irq_domain_add_linear(np, MV64x60_NUM_IRQS, + &mv64x60_host_ops, NULL); spin_lock_irqsave(&mv64x60_lock, flags); out_le32(mv64x60_gpp_reg_base + MV64x60_GPP_INTR_MASK, diff --git a/arch/powerpc/sysdev/qe_lib/qe_ic.c b/arch/powerpc/sysdev/qe_lib/qe_ic.c index 78e90192c343..e9b3d5cc65d3 100644 --- a/arch/powerpc/sysdev/qe_lib/qe_ic.c +++ b/arch/powerpc/sysdev/qe_lib/qe_ic.c @@ -339,8 +339,8 @@ void __init qe_ic_init(struct device_node *node, unsigned int flags, if (qe_ic == NULL) return; - qe_ic->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - NR_QE_IC_INTS, &qe_ic_host_ops, 0); + qe_ic->irqhost = irq_domain_add_linear(node, NR_QE_IC_INTS, + &qe_ic_host_ops, qe_ic); if (qe_ic->irqhost == NULL) { kfree(qe_ic); return; @@ -348,7 +348,6 @@ void __init qe_ic_init(struct device_node *node, unsigned int flags, qe_ic->regs = ioremap(res.start, resource_size(&res)); - qe_ic->irqhost->host_data = qe_ic; qe_ic->hc_irq = qe_ic_irq_chip; qe_ic->virq_high = irq_of_parse_and_map(node, 0); diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index f3757236e666..1be26f4b9c96 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -419,8 +419,7 @@ void __init tsi108_pci_int_init(struct device_node *node) { DBG("Tsi108_pci_int_init: initializing PCI interrupts\n"); - pci_irq_host = irq_alloc_host(node, IRQ_DOMAIN_MAP_LEGACY, - 0, &pci_irq_domain_ops, 0); + pci_irq_host = irq_domain_add_legacy(node, &pci_irq_domain_ops, NULL); if (pci_irq_host == NULL) { printk(KERN_ERR "pci_irq_host: failed to allocate irq domain!\n"); return; diff --git a/arch/powerpc/sysdev/uic.c b/arch/powerpc/sysdev/uic.c index 7eea3a64bdf7..84e59c97391f 100644 --- a/arch/powerpc/sysdev/uic.c +++ b/arch/powerpc/sysdev/uic.c @@ -270,13 +270,11 @@ static struct uic * __init uic_init_one(struct device_node *node) } uic->dcrbase = *dcrreg; - uic->irqhost = irq_alloc_host(node, IRQ_DOMAIN_MAP_LINEAR, - NR_UIC_INTS, &uic_host_ops, -1); + uic->irqhost = irq_domain_add_linear(node, NR_UIC_INTS, &uic_host_ops, + uic); if (! uic->irqhost) return NULL; /* FIXME: panic? */ - uic->irqhost->host_data = uic; - /* Start with all interrupts disabled, level and non-critical */ mtdcr(uic->dcrbase + UIC_ER, 0); mtdcr(uic->dcrbase + UIC_CR, 0); diff --git a/arch/powerpc/sysdev/xics/xics-common.c b/arch/powerpc/sysdev/xics/xics-common.c index fb2e303d25fb..ea5e204e3450 100644 --- a/arch/powerpc/sysdev/xics/xics-common.c +++ b/arch/powerpc/sysdev/xics/xics-common.c @@ -374,8 +374,7 @@ static struct irq_domain_ops xics_host_ops = { static void __init xics_init_host(void) { - xics_host = irq_alloc_host(NULL, IRQ_DOMAIN_MAP_TREE, 0, &xics_host_ops, - XICS_IRQ_SPURIOUS); + xics_host = irq_domain_add_tree(NULL, &xics_host_ops, NULL); BUG_ON(xics_host == NULL); irq_set_default_host(xics_host); } diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c index 92e7d4db9fde..8d73c3c0bee6 100644 --- a/arch/powerpc/sysdev/xilinx_intc.c +++ b/arch/powerpc/sysdev/xilinx_intc.c @@ -201,11 +201,10 @@ xilinx_intc_init(struct device_node *np) out_be32(regs + XINTC_MER, 0x3UL); /* Turn on the Master Enable. */ /* Allocate and initialize an irq_domain structure. */ - irq = irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, XILINX_INTC_MAXIRQS, - &xilinx_intc_ops, -1); + irq = irq_domain_add_linear(np, XILINX_INTC_MAXIRQS, &xilinx_intc_ops, + regs); if (!irq) panic(__FILE__ ": Cannot allocate IRQ host\n"); - irq->host_data = regs; return irq; } diff --git a/drivers/gpio/gpio-mpc8xxx.c b/drivers/gpio/gpio-mpc8xxx.c index 9efd59732eeb..149d9876fca8 100644 --- a/drivers/gpio/gpio-mpc8xxx.c +++ b/drivers/gpio/gpio-mpc8xxx.c @@ -364,9 +364,8 @@ static void __init mpc8xxx_add_controller(struct device_node *np) if (hwirq == NO_IRQ) goto skip_irq; - mpc8xxx_gc->irq = - irq_alloc_host(np, IRQ_DOMAIN_MAP_LINEAR, MPC8XXX_GPIO_PINS, - &mpc8xxx_gpio_irq_ops, MPC8XXX_GPIO_PINS); + mpc8xxx_gc->irq = irq_domain_add_linear(np, MPC8XXX_GPIO_PINS, + &mpc8xxx_gpio_irq_ops, mpc8xxx_gc); if (!mpc8xxx_gc->irq) goto skip_irq; @@ -374,8 +373,6 @@ static void __init mpc8xxx_add_controller(struct device_node *np) if (id) mpc8xxx_gc->of_dev_id_data = id->data; - mpc8xxx_gc->irq->host_data = mpc8xxx_gc; - /* ack and mask all irqs */ out_be32(mm_gc->regs + GPIO_IER, 0xffffffff); out_be32(mm_gc->regs + GPIO_IMR, 0); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 18f4ab002d2e..f95553fa6872 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -95,10 +95,6 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; -#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ -#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ -#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ -#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ union { struct { unsigned int size; @@ -120,11 +116,21 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN #ifdef CONFIG_PPC -extern struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq); +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data); +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + struct irq_domain_ops *ops, + void *host_data); +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data); +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data); + + extern struct irq_domain *irq_find_host(struct device_node *node); extern void irq_set_default_host(struct irq_domain *host); extern void irq_set_virq_count(unsigned int count); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 432d292b33f8..acedba1a2651 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -13,6 +13,11 @@ #include #include +#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ +#define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ +#define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ + static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); @@ -27,100 +32,158 @@ static int default_irq_domain_match(struct irq_domain *d, struct device_node *np } /** - * irq_alloc_host() - Allocate a new irq_domain data structure + * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller * @revmap_type: type of reverse mapping to use - * @revmap_arg: for IRQ_DOMAIN_MAP_LINEAR linear only: size of the map * @ops: map/unmap domain callbacks - * @inval_irq: provide a hw number in that domain space that is always invalid + * @host_data: Controller private data pointer * - * Allocates and initialize and irq_domain structure. Note that in the case of - * IRQ_DOMAIN_MAP_LEGACY, the map() callback will be called before this returns - * for all legacy interrupts except 0 (which is always the invalid irq for - * a legacy controller). For a IRQ_DOMAIN_MAP_LINEAR, the map is allocated by - * this call as well. For a IRQ_DOMAIN_MAP_TREE, the radix tree will be - * allocated later during boot automatically (the reverse mapping will use the - * slow path until that happens). + * Allocates and initialize and irq_domain structure. Caller is expected to + * register allocated irq_domain with irq_domain_register(). Returns pointer + * to IRQ domain, or NULL on failure. */ -struct irq_domain *irq_alloc_host(struct device_node *of_node, - unsigned int revmap_type, - unsigned int revmap_arg, - struct irq_domain_ops *ops, - irq_hw_number_t inval_irq) +static struct irq_domain *irq_domain_alloc(struct device_node *of_node, + unsigned int revmap_type, + struct irq_domain_ops *ops, + void *host_data) { - struct irq_domain *domain, *h; - unsigned int size = sizeof(struct irq_domain); - unsigned int i; - unsigned int *rmap; + struct irq_domain *domain; - /* Allocate structure and revmap table if using linear mapping */ - if (revmap_type == IRQ_DOMAIN_MAP_LINEAR) - size += revmap_arg * sizeof(unsigned int); - domain = kzalloc(size, GFP_KERNEL); - if (domain == NULL) + domain = kzalloc(sizeof(*domain), GFP_KERNEL); + if (WARN_ON(!domain)) return NULL; /* Fill structure */ domain->revmap_type = revmap_type; - domain->inval_irq = inval_irq; domain->ops = ops; + domain->host_data = host_data; domain->of_node = of_node_get(of_node); if (domain->ops->match == NULL) domain->ops->match = default_irq_domain_match; + return domain; +} + +static void irq_domain_add(struct irq_domain *domain) +{ + mutex_lock(&irq_domain_mutex); + list_add(&domain->link, &irq_domain_list); + mutex_unlock(&irq_domain_mutex); + pr_debug("irq: Allocated domain of type %d @0x%p\n", + domain->revmap_type, domain); +} + +/** + * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + * + * Note: the map() callback will be called before this function returns + * for all legacy interrupts except 0 (which is always the invalid irq for + * a legacy controller). + */ +struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain, *h; + unsigned int i; + + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); + if (!domain) + return NULL; + mutex_lock(&irq_domain_mutex); /* Make sure only one legacy controller can be created */ - if (revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { - mutex_unlock(&irq_domain_mutex); - of_node_put(domain->of_node); - kfree(domain); - return NULL; - } + list_for_each_entry(h, &irq_domain_list, link) { + if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + mutex_unlock(&irq_domain_mutex); + of_node_put(domain->of_node); + kfree(domain); + return NULL; } } list_add(&domain->link, &irq_domain_list); mutex_unlock(&irq_domain_mutex); - /* Additional setups per revmap type */ - switch(revmap_type) { - case IRQ_DOMAIN_MAP_LEGACY: - /* 0 is always the invalid number for legacy */ - domain->inval_irq = 0; - /* setup us as the domain for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; - irq_data->domain = domain; - - /* Legacy flags are left to default at this point, - * one can then use irq_create_mapping() to - * explicitly change them - */ - ops->map(domain, i, i); - - /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); - } - break; - case IRQ_DOMAIN_MAP_LINEAR: - rmap = (unsigned int *)(domain + 1); - for (i = 0; i < revmap_arg; i++) - rmap[i] = 0; - domain->revmap_data.linear.size = revmap_arg; - domain->revmap_data.linear.revmap = rmap; - break; - case IRQ_DOMAIN_MAP_TREE: - INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); - break; - default: - break; + /* setup us as the domain for all legacy interrupts */ + for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { + struct irq_data *irq_data = irq_get_irq_data(i); + irq_data->hwirq = i; + irq_data->domain = domain; + + /* Legacy flags are left to default at this point, + * one can then use irq_create_mapping() to + * explicitly change them + */ + ops->map(domain, i, i); + + /* Clear norequest flags */ + irq_clear_status_flags(i, IRQ_NOREQUEST); } + return domain; +} - pr_debug("irq: Allocated domain of type %d @0x%p\n", revmap_type, domain); +/** + * irq_domain_add_linear() - Allocate and register a legacy revmap irq_domain. + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * @host_data: Controller private data pointer + */ +struct irq_domain *irq_domain_add_linear(struct device_node *of_node, + unsigned int size, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + unsigned int *revmap; + + revmap = kzalloc(sizeof(*revmap) * size, GFP_KERNEL); + if (WARN_ON(!revmap)) + return NULL; + domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LINEAR, ops, host_data); + if (!domain) { + kfree(revmap); + return NULL; + } + domain->revmap_data.linear.size = size; + domain->revmap_data.linear.revmap = revmap; + irq_domain_add(domain); + return domain; +} + +struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_NOMAP, ops, host_data); + if (domain) + irq_domain_add(domain); + return domain; +} + +/** + * irq_domain_add_tree() + * @of_node: pointer to interrupt controller's device tree node. + * @ops: map/unmap domain callbacks + * + * Note: The radix tree will be allocated later during boot automatically + * (the reverse mapping will use the slow path until that happens). + */ +struct irq_domain *irq_domain_add_tree(struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain = irq_domain_alloc(of_node, + IRQ_DOMAIN_MAP_TREE, ops, host_data); + if (domain) { + INIT_RADIX_TREE(&domain->revmap_data.tree, GFP_KERNEL); + irq_domain_add(domain); + } return domain; } @@ -393,9 +456,6 @@ void irq_dispose_mapping(unsigned int virq) break; } - /* Destroy map */ - irq_data->hwirq = domain->inval_irq; - irq_free_desc(virq); } EXPORT_SYMBOL_GPL(irq_dispose_mapping); -- cgit v1.2.3 From 1bc04f2cf8c2a1feadbd994f50c40bb145bf2989 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:55 -0700 Subject: irq_domain: Add support for base irq and hwirq in legacy mappings Add support for a legacy mapping where irq = (hwirq - first_hwirq + first_irq) so that a controller driver can allocate a fixed range of irq_descs and use a simple calculation to translate back and forth between linux and hw irq numbers. This is needed to use an irq_domain with many of the ARM interrupt controller drivers that manage their own irq_desc allocations. Ultimately the goal is to migrate those drivers to use the linear revmap, but doing it this way allows each driver to be converted separately which makes the migration path easier. This patch generalizes the IRQ_DOMAIN_MAP_LEGACY method to use (first_irq-first_hwirq) as the offset between hwirq and linux irq number, and adds checks to make sure that the hwirq number does not exceed range assigned to the controller. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/include/asm/irq.h | 3 -- arch/powerpc/sysdev/i8259.c | 2 +- arch/powerpc/sysdev/tsi108_pci.c | 2 +- include/linux/irqdomain.h | 20 ++++++++- kernel/irq/irqdomain.c | 96 ++++++++++++++++++++++++++-------------- 5 files changed, 85 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/irq.h b/arch/powerpc/include/asm/irq.h index 728cc30d04ea..fe0b09dceb7d 100644 --- a/arch/powerpc/include/asm/irq.h +++ b/arch/powerpc/include/asm/irq.h @@ -36,9 +36,6 @@ extern atomic_t ppc_n_lost_interrupts; /* Total number of virq in the platform */ #define NR_IRQS CONFIG_NR_IRQS -/* Number of irqs reserved for the legacy controller */ -#define NUM_ISA_INTERRUPTS 16 - /* Same thing, used by the generic IRQ code */ #define NR_IRQS_LEGACY NUM_ISA_INTERRUPTS diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c index 573a73bd954a..997df6a7ab5d 100644 --- a/arch/powerpc/sysdev/i8259.c +++ b/arch/powerpc/sysdev/i8259.c @@ -263,7 +263,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr) raw_spin_unlock_irqrestore(&i8259_lock, flags); /* create a legacy host */ - i8259_host = irq_domain_add_legacy(node, &i8259_host_ops, NULL); + i8259_host = irq_domain_add_legacy_isa(node, &i8259_host_ops, NULL); if (i8259_host == NULL) { printk(KERN_ERR "i8259: failed to allocate irq host !\n"); return; diff --git a/arch/powerpc/sysdev/tsi108_pci.c b/arch/powerpc/sysdev/tsi108_pci.c index 1be26f4b9c96..188012c58f7f 100644 --- a/arch/powerpc/sysdev/tsi108_pci.c +++ b/arch/powerpc/sysdev/tsi108_pci.c @@ -419,7 +419,7 @@ void __init tsi108_pci_int_init(struct device_node *node) { DBG("Tsi108_pci_int_init: initializing PCI interrupts\n"); - pci_irq_host = irq_domain_add_legacy(node, &pci_irq_domain_ops, NULL); + pci_irq_host = irq_domain_add_legacy_isa(node, &pci_irq_domain_ops, NULL); if (pci_irq_host == NULL) { printk(KERN_ERR "pci_irq_host: failed to allocate irq domain!\n"); return; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index f95553fa6872..7fef39ed5523 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -39,6 +39,9 @@ struct device_node; struct irq_domain; struct of_device_id; +/* Number of irqs reserved for a legacy isa controller */ +#define NUM_ISA_INTERRUPTS 16 + /* This type is the placeholder for a hardware interrupt number. It has to * be big enough to enclose whatever representation is used by a given * platform. @@ -96,6 +99,11 @@ struct irq_domain { /* type of reverse mapping_technique */ unsigned int revmap_type; union { + struct { + unsigned int size; + unsigned int first_irq; + irq_hw_number_t first_hwirq; + } legacy; struct { unsigned int size; unsigned int *revmap; @@ -117,6 +125,9 @@ struct irq_domain { #ifdef CONFIG_IRQ_DOMAIN #ifdef CONFIG_PPC struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_linear(struct device_node *of_node, @@ -130,11 +141,18 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, struct irq_domain_ops *ops, void *host_data); - extern struct irq_domain *irq_find_host(struct device_node *node); extern void irq_set_default_host(struct irq_domain *host); extern void irq_set_virq_count(unsigned int count); +static inline struct irq_domain *irq_domain_add_legacy_isa( + struct device_node *of_node, + struct irq_domain_ops *ops, + void *host_data) +{ + return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, + host_data); +} extern unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index acedba1a2651..c6740d72073e 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -13,7 +13,8 @@ #include #include -#define IRQ_DOMAIN_MAP_LEGACY 0 /* legacy 8259, gets irqs 1..15 */ +#define IRQ_DOMAIN_MAP_LEGACY 0 /* driver allocated fixed range of irqs. + * ie. legacy 8259, gets irqs 1..15 */ #define IRQ_DOMAIN_MAP_NOMAP 1 /* no fast reverse mapping */ #define IRQ_DOMAIN_MAP_LINEAR 2 /* linear map of interrupts */ #define IRQ_DOMAIN_MAP_TREE 3 /* radix tree */ @@ -74,9 +75,25 @@ static void irq_domain_add(struct irq_domain *domain) domain->revmap_type, domain); } +static unsigned int irq_domain_legacy_revmap(struct irq_domain *domain, + irq_hw_number_t hwirq) +{ + irq_hw_number_t first_hwirq = domain->revmap_data.legacy.first_hwirq; + int size = domain->revmap_data.legacy.size; + + if (WARN_ON(hwirq < first_hwirq || hwirq >= first_hwirq + size)) + return 0; + return hwirq - first_hwirq + domain->revmap_data.legacy.first_irq; +} + /** * irq_domain_add_legacy() - Allocate and register a legacy revmap irq_domain. * @of_node: pointer to interrupt controller's device tree node. + * @size: total number of irqs in legacy mapping + * @first_irq: first number of irq block assigned to the domain + * @first_hwirq: first hwirq number to use for the translation. Should normally + * be '0', but a positive integer can be used if the effective + * hwirqs numbering does not begin at zero. * @ops: map/unmap domain callbacks * @host_data: Controller private data pointer * @@ -85,44 +102,64 @@ static void irq_domain_add(struct irq_domain *domain) * a legacy controller). */ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, struct irq_domain_ops *ops, void *host_data) { - struct irq_domain *domain, *h; + struct irq_domain *domain; unsigned int i; domain = irq_domain_alloc(of_node, IRQ_DOMAIN_MAP_LEGACY, ops, host_data); if (!domain) return NULL; + domain->revmap_data.legacy.first_irq = first_irq; + domain->revmap_data.legacy.first_hwirq = first_hwirq; + domain->revmap_data.legacy.size = size; + mutex_lock(&irq_domain_mutex); - /* Make sure only one legacy controller can be created */ - list_for_each_entry(h, &irq_domain_list, link) { - if (WARN_ON(h->revmap_type == IRQ_DOMAIN_MAP_LEGACY)) { + /* Verify that all the irqs are available */ + for (i = 0; i < size; i++) { + int irq = first_irq + i; + struct irq_data *irq_data = irq_get_irq_data(irq); + + if (WARN_ON(!irq_data || irq_data->domain)) { mutex_unlock(&irq_domain_mutex); of_node_put(domain->of_node); kfree(domain); return NULL; } } - list_add(&domain->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); - /* setup us as the domain for all legacy interrupts */ - for (i = 1; i < NUM_ISA_INTERRUPTS; i++) { - struct irq_data *irq_data = irq_get_irq_data(i); - irq_data->hwirq = i; + /* Claim all of the irqs before registering a legacy domain */ + for (i = 0; i < size; i++) { + struct irq_data *irq_data = irq_get_irq_data(first_irq + i); + irq_data->hwirq = first_hwirq + i; irq_data->domain = domain; + } + mutex_unlock(&irq_domain_mutex); + + for (i = 0; i < size; i++) { + int irq = first_irq + i; + int hwirq = first_hwirq + i; + + /* IRQ0 gets ignored */ + if (!irq) + continue; /* Legacy flags are left to default at this point, * one can then use irq_create_mapping() to * explicitly change them */ - ops->map(domain, i, i); + ops->map(domain, irq, hwirq); /* Clear norequest flags */ - irq_clear_status_flags(i, IRQ_NOREQUEST); + irq_clear_status_flags(irq, IRQ_NOREQUEST); } + + irq_domain_add(domain); return domain; } @@ -338,24 +375,19 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Get a virtual interrupt number */ - if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) { - /* Handle legacy */ - virq = (unsigned int)hwirq; - if (virq == 0 || virq >= NUM_ISA_INTERRUPTS) - return 0; - return virq; - } else { - /* Allocate a virtual interrupt number */ - hint = hwirq % irq_virq_count; - if (hint == 0) - hint++; - virq = irq_alloc_desc_from(hint, 0); - if (!virq) - virq = irq_alloc_desc_from(1, 0); - if (!virq) { - pr_debug("irq: -> virq allocation failed\n"); - return 0; - } + if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) + return irq_domain_legacy_revmap(domain, hwirq); + + /* Allocate a virtual interrupt number */ + hint = hwirq % irq_virq_count; + if (hint == 0) + hint++; + virq = irq_alloc_desc_from(hint, 0); + if (!virq) + virq = irq_alloc_desc_from(1, 0); + if (!virq) { + pr_debug("irq: -> virq allocation failed\n"); + return 0; } if (irq_setup_virq(domain, virq, hwirq)) { @@ -483,7 +515,7 @@ unsigned int irq_find_mapping(struct irq_domain *domain, /* legacy -> bail early */ if (domain->revmap_type == IRQ_DOMAIN_MAP_LEGACY) - return hwirq; + return irq_domain_legacy_revmap(domain, hwirq); /* Slow path does a linear search of the map */ if (hint == 0) -- cgit v1.2.3 From 75294957be1dee7d22dd7d90bd31334ba410e836 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 14 Feb 2012 14:06:57 -0700 Subject: irq_domain: Remove 'new' irq_domain in favour of the ppc one This patch removes the simplistic implementation of irq_domains and enables the powerpc infrastructure for all irq_domain users. The powerpc infrastructure includes support for complex mappings between Linux and hardware irq numbers, and can manage allocation of irq_descs. This patch also converts the few users of irq_domain_add()/irq_domain_del() to call irq_domain_add_legacy() instead. v3: Fix bug that set up too many irqs in translation range. v2: Fix removal of irq_alloc_descs() call in gic driver Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- arch/arm/common/gic.c | 85 +++++++++---------- arch/arm/common/vic.c | 16 ++-- arch/arm/include/asm/hardware/gic.h | 4 +- arch/arm/include/asm/hardware/vic.h | 2 + arch/arm/mach-exynos/common.c | 2 +- arch/arm/mach-versatile/core.c | 7 +- drivers/mfd/twl-core.c | 14 +--- include/linux/irqdomain.h | 45 +--------- kernel/irq/irqdomain.c | 159 +++--------------------------------- 9 files changed, 71 insertions(+), 263 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c index dc19862be0a8..7275d808f76d 100644 --- a/arch/arm/common/gic.c +++ b/arch/arm/common/gic.c @@ -51,7 +51,6 @@ union gic_base { }; struct gic_chip_data { - unsigned int irq_offset; union gic_base dist_base; union gic_base cpu_base; #ifdef CONFIG_CPU_PM @@ -61,9 +60,7 @@ struct gic_chip_data { u32 __percpu *saved_ppi_enable; u32 __percpu *saved_ppi_conf; #endif -#ifdef CONFIG_IRQ_DOMAIN - struct irq_domain domain; -#endif + struct irq_domain *domain; unsigned int gic_irqs; #ifdef CONFIG_GIC_NON_BANKED void __iomem *(*get_base)(union gic_base *); @@ -282,7 +279,7 @@ asmlinkage void __exception_irq_entry gic_handle_irq(struct pt_regs *regs) irqnr = irqstat & ~0x1c00; if (likely(irqnr > 15 && irqnr < 1021)) { - irqnr = irq_domain_to_irq(&gic->domain, irqnr); + irqnr = irq_find_mapping(gic->domain, irqnr); handle_IRQ(irqnr, regs); continue; } @@ -314,8 +311,8 @@ static void gic_handle_cascade_irq(unsigned int irq, struct irq_desc *desc) if (gic_irq == 1023) goto out; - cascade_irq = irq_domain_to_irq(&chip_data->domain, gic_irq); - if (unlikely(gic_irq < 32 || gic_irq > 1020 || cascade_irq >= NR_IRQS)) + cascade_irq = irq_find_mapping(chip_data->domain, gic_irq); + if (unlikely(gic_irq < 32 || gic_irq > 1020)) do_bad_IRQ(cascade_irq, desc); else generic_handle_irq(cascade_irq); @@ -348,10 +345,9 @@ void __init gic_cascade_irq(unsigned int gic_nr, unsigned int irq) static void __init gic_dist_init(struct gic_chip_data *gic) { - unsigned int i, irq; + unsigned int i; u32 cpumask; unsigned int gic_irqs = gic->gic_irqs; - struct irq_domain *domain = &gic->domain; void __iomem *base = gic_data_dist_base(gic); u32 cpu = cpu_logical_map(smp_processor_id()); @@ -386,23 +382,6 @@ static void __init gic_dist_init(struct gic_chip_data *gic) for (i = 32; i < gic_irqs; i += 32) writel_relaxed(0xffffffff, base + GIC_DIST_ENABLE_CLEAR + i * 4 / 32); - /* - * Setup the Linux IRQ subsystem. - */ - irq_domain_for_each_irq(domain, i, irq) { - if (i < 32) { - irq_set_percpu_devid(irq); - irq_set_chip_and_handler(irq, &gic_chip, - handle_percpu_devid_irq); - set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN); - } else { - irq_set_chip_and_handler(irq, &gic_chip, - handle_fasteoi_irq); - set_irq_flags(irq, IRQF_VALID | IRQF_PROBE); - } - irq_set_chip_data(irq, gic); - } - writel_relaxed(1, base + GIC_DIST_CTRL); } @@ -618,7 +597,23 @@ static void __init gic_pm_init(struct gic_chip_data *gic) } #endif -#ifdef CONFIG_OF +static int gic_irq_domain_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hw) +{ + if (hw < 32) { + irq_set_percpu_devid(irq); + irq_set_chip_and_handler(irq, &gic_chip, + handle_percpu_devid_irq); + set_irq_flags(irq, IRQF_VALID | IRQF_NOAUTOEN); + } else { + irq_set_chip_and_handler(irq, &gic_chip, + handle_fasteoi_irq); + set_irq_flags(irq, IRQF_VALID | IRQF_PROBE); + } + irq_set_chip_data(irq, d->host_data); + return 0; +} + static int gic_irq_domain_xlate(struct irq_domain *d, struct device_node *controller, const u32 *intspec, unsigned int intsize, @@ -639,26 +634,23 @@ static int gic_irq_domain_xlate(struct irq_domain *d, *out_type = intspec[2] & IRQ_TYPE_SENSE_MASK; return 0; } -#endif struct irq_domain_ops gic_irq_domain_ops = { -#ifdef CONFIG_OF + .map = gic_irq_domain_map, .xlate = gic_irq_domain_xlate, -#endif }; void __init gic_init_bases(unsigned int gic_nr, int irq_start, void __iomem *dist_base, void __iomem *cpu_base, - u32 percpu_offset) + u32 percpu_offset, struct device_node *node) { + irq_hw_number_t hwirq_base; struct gic_chip_data *gic; - struct irq_domain *domain; - int gic_irqs; + int gic_irqs, irq_base; BUG_ON(gic_nr >= MAX_GIC_NR); gic = &gic_data[gic_nr]; - domain = &gic->domain; #ifdef CONFIG_GIC_NON_BANKED if (percpu_offset) { /* Frankein-GIC without banked registers... */ unsigned int cpu; @@ -694,10 +686,10 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, * For primary GICs, skip over SGIs. * For secondary GICs, skip over PPIs, too. */ - domain->hwirq_base = 32; + hwirq_base = 32; if (gic_nr == 0) { if ((irq_start & 31) > 0) { - domain->hwirq_base = 16; + hwirq_base = 16; if (irq_start != -1) irq_start = (irq_start & ~31) + 16; } @@ -713,17 +705,17 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, gic_irqs = 1020; gic->gic_irqs = gic_irqs; - domain->nr_irq = gic_irqs - domain->hwirq_base; - domain->irq_base = irq_alloc_descs(irq_start, 16, domain->nr_irq, - numa_node_id()); - if (IS_ERR_VALUE(domain->irq_base)) { + gic_irqs -= hwirq_base; /* calculate # of irqs to allocate */ + irq_base = irq_alloc_descs(irq_start, 16, gic_irqs, numa_node_id()); + if (IS_ERR_VALUE(irq_base)) { WARN(1, "Cannot allocate irq_descs @ IRQ%d, assuming pre-allocated\n", irq_start); - domain->irq_base = irq_start; + irq_base = irq_start; } - domain->host_data = gic; - domain->ops = &gic_irq_domain_ops; - irq_domain_add(domain); + gic->domain = irq_domain_add_legacy(node, gic_irqs, irq_base, + hwirq_base, &gic_irq_domain_ops, gic); + if (WARN_ON(!gic->domain)) + return; gic_chip.flags |= gic_arch_extn.flags; gic_dist_init(gic); @@ -768,7 +760,6 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent) void __iomem *dist_base; u32 percpu_offset; int irq; - struct irq_domain *domain = &gic_data[gic_cnt].domain; if (WARN_ON(!node)) return -ENODEV; @@ -782,9 +773,7 @@ int __init gic_of_init(struct device_node *node, struct device_node *parent) if (of_property_read_u32(node, "cpu-offset", &percpu_offset)) percpu_offset = 0; - domain->of_node = of_node_get(node); - - gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset); + gic_init_bases(gic_cnt, -1, dist_base, cpu_base, percpu_offset, node); if (parent) { irq = irq_of_parse_and_map(node, 0); diff --git a/arch/arm/common/vic.c b/arch/arm/common/vic.c index dcb004a804c7..7a66311f3066 100644 --- a/arch/arm/common/vic.c +++ b/arch/arm/common/vic.c @@ -56,7 +56,7 @@ struct vic_device { u32 int_enable; u32 soft_int; u32 protect; - struct irq_domain domain; + struct irq_domain *domain; }; /* we cannot allocate memory when VICs are initially registered */ @@ -192,14 +192,8 @@ static void __init vic_register(void __iomem *base, unsigned int irq, v->resume_sources = resume_sources; v->irq = irq; vic_id++; - - v->domain.irq_base = irq; - v->domain.nr_irq = 32; -#ifdef CONFIG_OF_IRQ - v->domain.of_node = of_node_get(node); -#endif /* CONFIG_OF */ - v->domain.ops = &irq_domain_simple_ops; - irq_domain_add(&v->domain); + v->domain = irq_domain_add_legacy(node, 32, irq, 0, + &irq_domain_simple_ops, v); } static void vic_ack_irq(struct irq_data *d) @@ -348,7 +342,7 @@ static void __init vic_init_st(void __iomem *base, unsigned int irq_start, vic_register(base, irq_start, 0, node); } -static void __init __vic_init(void __iomem *base, unsigned int irq_start, +void __init __vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, u32 resume_sources, struct device_node *node) { @@ -444,7 +438,7 @@ static int handle_one_vic(struct vic_device *vic, struct pt_regs *regs) stat = readl_relaxed(vic->base + VIC_IRQ_STATUS); while (stat) { irq = ffs(stat) - 1; - handle_IRQ(irq_domain_to_irq(&vic->domain, irq), regs); + handle_IRQ(irq_find_mapping(vic->domain, irq), regs); stat &= ~(1 << irq); handled = 1; } diff --git a/arch/arm/include/asm/hardware/gic.h b/arch/arm/include/asm/hardware/gic.h index 4bdfe0018696..4b1ce6cd477f 100644 --- a/arch/arm/include/asm/hardware/gic.h +++ b/arch/arm/include/asm/hardware/gic.h @@ -39,7 +39,7 @@ struct device_node; extern struct irq_chip gic_arch_extn; void gic_init_bases(unsigned int, int, void __iomem *, void __iomem *, - u32 offset); + u32 offset, struct device_node *); int gic_of_init(struct device_node *node, struct device_node *parent); void gic_secondary_init(unsigned int); void gic_handle_irq(struct pt_regs *regs); @@ -49,7 +49,7 @@ void gic_raise_softirq(const struct cpumask *mask, unsigned int irq); static inline void gic_init(unsigned int nr, int start, void __iomem *dist , void __iomem *cpu) { - gic_init_bases(nr, start, dist, cpu, 0); + gic_init_bases(nr, start, dist, cpu, 0, NULL); } #endif diff --git a/arch/arm/include/asm/hardware/vic.h b/arch/arm/include/asm/hardware/vic.h index f42ebd619590..e14af1a1a320 100644 --- a/arch/arm/include/asm/hardware/vic.h +++ b/arch/arm/include/asm/hardware/vic.h @@ -47,6 +47,8 @@ struct device_node; struct pt_regs; +void __vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, + u32 resume_sources, struct device_node *node); void vic_init(void __iomem *base, unsigned int irq_start, u32 vic_sources, u32 resume_sources); int vic_of_init(struct device_node *node, struct device_node *parent); void vic_handle_irq(struct pt_regs *regs); diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c index c59e18871006..6de298c5d2d3 100644 --- a/arch/arm/mach-exynos/common.c +++ b/arch/arm/mach-exynos/common.c @@ -402,7 +402,7 @@ void __init exynos4_init_irq(void) gic_bank_offset = soc_is_exynos4412() ? 0x4000 : 0x8000; if (!of_have_populated_dt()) - gic_init_bases(0, IRQ_PPI(0), S5P_VA_GIC_DIST, S5P_VA_GIC_CPU, gic_bank_offset); + gic_init_bases(0, IRQ_PPI(0), S5P_VA_GIC_DIST, S5P_VA_GIC_CPU, gic_bank_offset, NULL); #ifdef CONFIG_OF else of_irq_init(exynos4_dt_irq_match); diff --git a/arch/arm/mach-versatile/core.c b/arch/arm/mach-versatile/core.c index 02b7b9303f3b..008ce22b9a06 100644 --- a/arch/arm/mach-versatile/core.c +++ b/arch/arm/mach-versatile/core.c @@ -98,8 +98,11 @@ static const struct of_device_id sic_of_match[] __initconst = { void __init versatile_init_irq(void) { - vic_init(VA_VIC_BASE, IRQ_VIC_START, ~0, 0); - irq_domain_generate_simple(vic_of_match, VERSATILE_VIC_BASE, IRQ_VIC_START); + struct device_node *np; + + np = of_find_matching_node_by_address(NULL, vic_of_match, + VERSATILE_VIC_BASE); + __vic_init(VA_VIC_BASE, IRQ_VIC_START, ~0, 0, np); writel(~0, VA_SIC_BASE + SIC_IRQ_ENABLE_CLEAR); diff --git a/drivers/mfd/twl-core.c b/drivers/mfd/twl-core.c index 49677339ab5f..66f9bffc50f0 100644 --- a/drivers/mfd/twl-core.c +++ b/drivers/mfd/twl-core.c @@ -263,10 +263,6 @@ struct twl_client { static struct twl_client twl_modules[TWL_NUM_SLAVES]; -#ifdef CONFIG_IRQ_DOMAIN -static struct irq_domain domain; -#endif - /* mapping the module id to slave id and base address */ struct twl_mapping { unsigned char sid; /* Slave ID */ @@ -1227,14 +1223,8 @@ twl_probe(struct i2c_client *client, const struct i2c_device_id *id) pdata->irq_base = status; pdata->irq_end = pdata->irq_base + nr_irqs; - -#ifdef CONFIG_IRQ_DOMAIN - domain.irq_base = pdata->irq_base; - domain.nr_irq = nr_irqs; - domain.of_node = of_node_get(node); - domain.ops = &irq_domain_simple_ops; - irq_domain_add(&domain); -#endif + irq_domain_add_legacy(node, nr_irqs, pdata->irq_base, 0, + &irq_domain_simple_ops, NULL); if (i2c_check_functionality(client->adapter, I2C_FUNC_I2C) == 0) { dev_dbg(&client->dev, "can't talk I2C?\n"); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 7fef39ed5523..624e9ac89e79 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -55,9 +55,6 @@ typedef unsigned long irq_hw_number_t; * @map: Create or update a mapping between a virtual irq number and a hw * irq number. This is called only once for a given mapping. * @unmap: Dispose of such a mapping - * @to_irq: (optional) given a local hardware irq number, return the linux - * irq number. If to_irq is not implemented, then the irq_domain - * will use this translation: irq = (domain->irq_base + hwirq) * @xlate: Given a device tree node and interrupt specifier, decode * the hardware irq number and linux irq type value. * @@ -70,7 +67,6 @@ struct irq_domain_ops { int (*match)(struct irq_domain *d, struct device_node *node); int (*map)(struct irq_domain *d, unsigned int virq, irq_hw_number_t hw); void (*unmap)(struct irq_domain *d, unsigned int virq); - unsigned int (*to_irq)(struct irq_domain *d, unsigned long hwirq); int (*xlate)(struct irq_domain *d, struct device_node *node, const u32 *intspec, unsigned int intsize, unsigned long *out_hwirq, unsigned int *out_type); @@ -114,16 +110,11 @@ struct irq_domain { void *host_data; irq_hw_number_t inval_irq; - unsigned int irq_base; - unsigned int nr_irq; - unsigned int hwirq_base; - /* Optional device node pointer */ struct device_node *of_node; }; #ifdef CONFIG_IRQ_DOMAIN -#ifdef CONFIG_PPC struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, @@ -153,6 +144,10 @@ static inline struct irq_domain *irq_domain_add_legacy_isa( return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, host_data); } +extern struct irq_domain *irq_find_host(struct device_node *node); +extern void irq_set_default_host(struct irq_domain *host); +extern void irq_set_virq_count(unsigned int count); + extern unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq); @@ -167,38 +162,7 @@ extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, extern unsigned int irq_linear_revmap(struct irq_domain *host, irq_hw_number_t hwirq); -#else /* CONFIG_PPC */ - -/** - * irq_domain_to_irq() - Translate from a hardware irq to a linux irq number - * - * Returns the linux irq number associated with a hardware irq. By default, - * the mapping is irq == domain->irq_base + hwirq, but this mapping can - * be overridden if the irq_domain implements a .to_irq() hook. - */ -static inline unsigned int irq_domain_to_irq(struct irq_domain *d, - unsigned long hwirq) -{ - if (d->ops->to_irq) - return d->ops->to_irq(d, hwirq); - if (WARN_ON(hwirq < d->hwirq_base)) - return 0; - return d->irq_base + hwirq - d->hwirq_base; -} - -#define irq_domain_for_each_hwirq(d, hw) \ - for (hw = d->hwirq_base; hw < d->hwirq_base + d->nr_irq; hw++) - -#define irq_domain_for_each_irq(d, hw, irq) \ - for (hw = d->hwirq_base, irq = irq_domain_to_irq(d, hw); \ - hw < d->hwirq_base + d->nr_irq; \ - hw++, irq = irq_domain_to_irq(d, hw)) - -extern void irq_domain_add(struct irq_domain *domain); -extern void irq_domain_del(struct irq_domain *domain); - extern struct irq_domain_ops irq_domain_simple_ops; - #if defined(CONFIG_OF_IRQ) extern void irq_domain_add_simple(struct device_node *controller, int irq_base); extern void irq_domain_generate_simple(const struct of_device_id *match, @@ -207,7 +171,6 @@ extern void irq_domain_generate_simple(const struct of_device_id *match, static inline void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { } #endif /* !CONFIG_OF_IRQ */ -#endif /* !CONFIG_PPC */ #endif /* CONFIG_IRQ_DOMAIN */ #endif /* _LINUX_IRQDOMAIN_H */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index c6740d72073e..2981ebfeb40c 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -22,7 +22,6 @@ static LIST_HEAD(irq_domain_list); static DEFINE_MUTEX(irq_domain_mutex); -#ifdef CONFIG_PPC static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; @@ -694,124 +693,11 @@ static int __init irq_debugfs_init(void) __initcall(irq_debugfs_init); #endif /* CONFIG_VIRQ_DEBUG */ -#else /* CONFIG_PPC */ - -/** - * irq_domain_add() - Register an irq_domain - * @domain: ptr to initialized irq_domain structure - * - * Registers an irq_domain structure. The irq_domain must at a minimum be - * initialized with an ops structure pointer, and either a ->to_irq hook or - * a valid irq_base value. Everything else is optional. - */ -void irq_domain_add(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - /* - * This assumes that the irq_domain owner has already allocated - * the irq_descs. This block will be removed when support for dynamic - * allocation of irq_descs is added to irq_domain. - */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - if (!d) { - WARN(1, "error: assigning domain to non existant irq_desc"); - return; - } - if (d->domain) { - /* things are broken; just report, don't clean up */ - WARN(1, "error: irq_desc already assigned to a domain"); - return; - } - d->domain = domain; - d->hwirq = hwirq; - } - - mutex_lock(&irq_domain_mutex); - list_add(&domain->link, &irq_domain_list); - mutex_unlock(&irq_domain_mutex); -} - -/** - * irq_domain_del() - Unregister an irq_domain - * @domain: ptr to registered irq_domain. - */ -void irq_domain_del(struct irq_domain *domain) -{ - struct irq_data *d; - int hwirq, irq; - - mutex_lock(&irq_domain_mutex); - list_del(&domain->link); - mutex_unlock(&irq_domain_mutex); - - /* Clear the irq_domain assignments */ - irq_domain_for_each_irq(domain, hwirq, irq) { - d = irq_get_irq_data(irq); - d->domain = NULL; - } -} - -#if defined(CONFIG_OF_IRQ) -/** - * irq_create_of_mapping() - Map a linux irq number from a DT interrupt spec - * - * Used by the device tree interrupt mapping code to translate a device tree - * interrupt specifier to a valid linux irq number. Returns either a valid - * linux IRQ number or 0. - * - * When the caller no longer need the irq number returned by this function it - * should arrange to call irq_dispose_mapping(). - */ -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - struct irq_domain *domain; - unsigned long hwirq; - unsigned int irq, type; - int rc = -EINVAL; - - /* Find a domain which can translate the irq spec */ - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, link) { - if (!domain->ops->xlate) - continue; - rc = domain->ops->xlate(domain, controller, - intspec, intsize, &hwirq, &type); - if (rc == 0) - break; - } - mutex_unlock(&irq_domain_mutex); - - if (rc != 0) - return 0; - - irq = irq_domain_to_irq(domain, hwirq); - if (type != IRQ_TYPE_NONE) - irq_set_irq_type(irq, type); - pr_debug("%s: mapped hwirq=%i to irq=%i, flags=%x\n", - controller->full_name, (int)hwirq, irq, type); - return irq; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - -/** - * irq_dispose_mapping() - Discard a mapping created by irq_create_of_mapping() - * @irq: linux irq number to be discarded - * - * Calling this function indicates the caller no longer needs a reference to - * the linux irq number returned by a prior call to irq_create_of_mapping(). - */ -void irq_dispose_mapping(unsigned int irq) +int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, + irq_hw_number_t hwirq) { - /* - * nothing yet; will be filled when support for dynamic allocation of - * irq_descs is added to irq_domain - */ + return 0; } -EXPORT_SYMBOL_GPL(irq_dispose_mapping); int irq_domain_simple_xlate(struct irq_domain *d, struct device_node *controller, @@ -822,10 +708,6 @@ int irq_domain_simple_xlate(struct irq_domain *d, return -EINVAL; if (intsize < 1) return -EINVAL; - if (d->nr_irq && ((intspec[0] < d->hwirq_base) || - (intspec[0] >= d->hwirq_base + d->nr_irq))) - return -EINVAL; - *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; if (intsize > 1) @@ -833,23 +715,17 @@ int irq_domain_simple_xlate(struct irq_domain *d, return 0; } -/** - * irq_domain_create_simple() - Set up a 'simple' translation range - */ +struct irq_domain_ops irq_domain_simple_ops = { + .map = irq_domain_simple_map, + .xlate = irq_domain_simple_xlate, +}; +EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +#ifdef CONFIG_OF_IRQ void irq_domain_add_simple(struct device_node *controller, int irq_base) { - struct irq_domain *domain; - - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) { - WARN_ON(1); - return; - } - - domain->irq_base = irq_base; - domain->of_node = of_node_get(controller); - domain->ops = &irq_domain_simple_ops; - irq_domain_add(domain); + irq_domain_add_legacy(controller, 32, irq_base, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_add_simple); @@ -864,13 +740,4 @@ void irq_domain_generate_simple(const struct of_device_id *match, irq_domain_add_simple(node, irq_start); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); -#endif /* CONFIG_OF_IRQ */ - -struct irq_domain_ops irq_domain_simple_ops = { -#ifdef CONFIG_OF_IRQ - .xlate = irq_domain_simple_xlate, -#endif /* CONFIG_OF_IRQ */ -}; -EXPORT_SYMBOL_GPL(irq_domain_simple_ops); - -#endif /* !CONFIG_PPC */ +#endif -- cgit v1.2.3 From 6b783f7c5dde2648fa0bbe7fc8ac80d78699e67f Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 10 Jan 2012 17:09:30 -0700 Subject: irq_domain: Remove irq_domain_add_simple() irq_domain_add_simple() was a stop-gap measure until complete irq_domain support was complete. This patch removes the irq_domain_add_simple() interface. This patch also drops the explicit irq_domain initialization performed by the mach-versatile code because the versatile interrupt controller already has irq_domain support built into it. This was a bug that was hanging around quietly for a while, but with the full irq_domain which actually verifies that irq_domain ranges are available it would cause the registration to fail and the system wouldn't boot. v4: Fixed number of irqs in mx5 gpio code v2: Updated to pass in host_data pointer on irq_domain allocation. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Thomas Gleixner Cc: Milton Miller Cc: Russell King Tested-by: Olof Johansson --- arch/arm/mach-imx/imx51-dt.c | 4 ++-- arch/arm/mach-imx/imx53-dt.c | 4 ++-- arch/arm/mach-imx/mach-imx6q.c | 3 ++- arch/arm/mach-msm/board-msm8x60.c | 8 ++------ arch/arm/mach-omap2/board-generic.c | 2 +- arch/arm/mach-prima2/irq.c | 2 +- include/linux/irqdomain.h | 1 - kernel/irq/irqdomain.c | 10 ++-------- 8 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-imx/imx51-dt.c b/arch/arm/mach-imx/imx51-dt.c index e6bad17b908c..1e03ef42faa0 100644 --- a/arch/arm/mach-imx/imx51-dt.c +++ b/arch/arm/mach-imx/imx51-dt.c @@ -47,7 +47,7 @@ static const struct of_dev_auxdata imx51_auxdata_lookup[] __initconst = { static int __init imx51_tzic_add_irq_domain(struct device_node *np, struct device_node *interrupt_parent) { - irq_domain_add_simple(np, 0); + irq_domain_add_legacy(np, 128, 0, 0, &irq_domain_simple_ops, NULL); return 0; } @@ -57,7 +57,7 @@ static int __init imx51_gpio_add_irq_domain(struct device_node *np, static int gpio_irq_base = MXC_GPIO_IRQ_START + ARCH_NR_GPIOS; gpio_irq_base -= 32; - irq_domain_add_simple(np, gpio_irq_base); + irq_domain_add_legacy(np, 32, gpio_irq_base, 0, &irq_domain_simple_ops, NULL); return 0; } diff --git a/arch/arm/mach-imx/imx53-dt.c b/arch/arm/mach-imx/imx53-dt.c index 05ebb3e68679..fd5be0f20fbb 100644 --- a/arch/arm/mach-imx/imx53-dt.c +++ b/arch/arm/mach-imx/imx53-dt.c @@ -51,7 +51,7 @@ static const struct of_dev_auxdata imx53_auxdata_lookup[] __initconst = { static int __init imx53_tzic_add_irq_domain(struct device_node *np, struct device_node *interrupt_parent) { - irq_domain_add_simple(np, 0); + irq_domain_add_legacy(np, 128, 0, 0, &irq_domain_simple_ops, NULL); return 0; } @@ -61,7 +61,7 @@ static int __init imx53_gpio_add_irq_domain(struct device_node *np, static int gpio_irq_base = MXC_GPIO_IRQ_START + ARCH_NR_GPIOS; gpio_irq_base -= 32; - irq_domain_add_simple(np, gpio_irq_base); + irq_domain_add_legacy(np, 32, gpio_irq_base, 0, &irq_domain_simple_ops, NULL); return 0; } diff --git a/arch/arm/mach-imx/mach-imx6q.c b/arch/arm/mach-imx/mach-imx6q.c index c25728106917..6075d4d62dd6 100644 --- a/arch/arm/mach-imx/mach-imx6q.c +++ b/arch/arm/mach-imx/mach-imx6q.c @@ -97,7 +97,8 @@ static int __init imx6q_gpio_add_irq_domain(struct device_node *np, static int gpio_irq_base = MXC_GPIO_IRQ_START + ARCH_NR_GPIOS; gpio_irq_base -= 32; - irq_domain_add_simple(np, gpio_irq_base); + irq_domain_add_legacy(np, 32, gpio_irq_base, 0, &irq_domain_simple_ops, + NULL); return 0; } diff --git a/arch/arm/mach-msm/board-msm8x60.c b/arch/arm/mach-msm/board-msm8x60.c index 0a113424632c..962e71169750 100644 --- a/arch/arm/mach-msm/board-msm8x60.c +++ b/arch/arm/mach-msm/board-msm8x60.c @@ -80,12 +80,8 @@ static struct of_device_id msm_dt_gic_match[] __initdata = { static void __init msm8x60_dt_init(void) { - struct device_node *node; - - node = of_find_matching_node_by_address(NULL, msm_dt_gic_match, - MSM8X60_QGIC_DIST_PHYS); - if (node) - irq_domain_add_simple(node, GIC_SPI_START); + irq_domain_generate_simple(msm_dt_gic_match, MSM8X60_QGIC_DIST_PHYS, + GIC_SPI_START); if (of_machine_is_compatible("qcom,msm8660-surf")) { printk(KERN_INFO "Init surf UART registers\n"); diff --git a/arch/arm/mach-omap2/board-generic.c b/arch/arm/mach-omap2/board-generic.c index d58756060483..00b1d024fa87 100644 --- a/arch/arm/mach-omap2/board-generic.c +++ b/arch/arm/mach-omap2/board-generic.c @@ -67,7 +67,7 @@ static void __init omap_generic_init(void) { struct device_node *node = of_find_matching_node(NULL, intc_match); if (node) - irq_domain_add_simple(node, 0); + irq_domain_add_legacy(node, 32, 0, 0, &irq_domain_simple_ops, NULL); omap_sdrc_init(NULL, NULL); diff --git a/arch/arm/mach-prima2/irq.c b/arch/arm/mach-prima2/irq.c index d93ceef4a50a..37c2de9b6f26 100644 --- a/arch/arm/mach-prima2/irq.c +++ b/arch/arm/mach-prima2/irq.c @@ -68,7 +68,7 @@ void __init sirfsoc_of_irq_init(void) if (!sirfsoc_intc_base) panic("unable to map intc cpu registers\n"); - irq_domain_add_simple(np, 0); + irq_domain_add_legacy(np, 32, 0, 0, &irq_domain_simple_ops, NULL); of_node_put(np); diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 624e9ac89e79..e7379a3c4d7d 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -164,7 +164,6 @@ extern unsigned int irq_linear_revmap(struct irq_domain *host, extern struct irq_domain_ops irq_domain_simple_ops; #if defined(CONFIG_OF_IRQ) -extern void irq_domain_add_simple(struct device_node *controller, int irq_base); extern void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start); #else /* CONFIG_OF_IRQ */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 2981ebfeb40c..6328d9350f04 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -722,13 +722,6 @@ struct irq_domain_ops irq_domain_simple_ops = { EXPORT_SYMBOL_GPL(irq_domain_simple_ops); #ifdef CONFIG_OF_IRQ -void irq_domain_add_simple(struct device_node *controller, int irq_base) -{ - irq_domain_add_legacy(controller, 32, irq_base, 0, - &irq_domain_simple_ops, NULL); -} -EXPORT_SYMBOL_GPL(irq_domain_add_simple); - void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start) { @@ -737,7 +730,8 @@ void irq_domain_generate_simple(const struct of_device_id *match, (unsigned long long) phys_base, (int) irq_start); node = of_find_matching_node_by_address(NULL, match, phys_base); if (node) - irq_domain_add_simple(node, irq_start); + irq_domain_add_legacy(node, 32, irq_start, 0, + &irq_domain_simple_ops, NULL); } EXPORT_SYMBOL_GPL(irq_domain_generate_simple); #endif -- cgit v1.2.3 From 16b2e6e2f31dda41f114aa0acade04f7e10f67c9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 11:26:52 -0700 Subject: irq_domain: Create common xlate functions that device drivers can use Rather than having each interrupt controller driver creating its own barely unique .xlate function for irq_domain, create a library of translators which any driver can use directly. v5: - Remove irq_domain_xlate_pci(). It was incorrect. Signed-off-by: Grant Likely Cc: Rob Herring Cc: Benjamin Herrenschmidt Cc: Mark Salter Cc: Thomas Gleixner Cc: Milton Miller Tested-by: Olof Johansson --- include/linux/irqdomain.h | 12 +++++++++ kernel/irq/irqdomain.c | 65 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 67 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index e7379a3c4d7d..ea58f36688a0 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -163,6 +163,18 @@ extern unsigned int irq_linear_revmap(struct irq_domain *host, irq_hw_number_t hwirq); extern struct irq_domain_ops irq_domain_simple_ops; + +/* stock xlate functions */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); +int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); + #if defined(CONFIG_OF_IRQ) extern void irq_domain_generate_simple(const struct of_device_id *match, u64 phys_base, unsigned int irq_start); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6328d9350f04..456e3fc8387f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -699,25 +699,70 @@ int irq_domain_simple_map(struct irq_domain *d, unsigned int irq, return 0; } -int irq_domain_simple_xlate(struct irq_domain *d, - struct device_node *controller, - const u32 *intspec, unsigned int intsize, - unsigned long *out_hwirq, unsigned int *out_type) +/** + * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings + * + * Device Tree IRQ specifier translation function which works with one cell + * bindings where the cell value maps directly to the hwirq number. + */ +int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) { - if (d->of_node != controller) - return -EINVAL; - if (intsize < 1) + if (WARN_ON(intsize < 1)) return -EINVAL; *out_hwirq = intspec[0]; *out_type = IRQ_TYPE_NONE; - if (intsize > 1) - *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; return 0; } +EXPORT_SYMBOL_GPL(irq_domain_xlate_onecell); + +/** + * irq_domain_xlate_twocell() - Generic xlate for direct two cell bindings + * + * Device Tree IRQ specifier translation function which works with two cell + * bindings where the cell values map directly to the hwirq number + * and linux irq flags. + */ +int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 2)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = intspec[1] & IRQ_TYPE_SENSE_MASK; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); + +/** + * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings + * + * Device Tree IRQ specifier translation function which works with either one + * or two cell bindings where the cell values map directly to the hwirq number + * and linux irq flags. + * + * Note: don't use this function unless your interrupt controller explicitly + * supports both one and two cell bindings. For the majority of controllers + * the _onecell() or _twocell() variants above should be used. + */ +int irq_domain_xlate_onetwocell(struct irq_domain *d, + struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (WARN_ON(intsize < 1)) + return -EINVAL; + *out_hwirq = intspec[0]; + *out_type = (intsize > 1) ? intspec[1] : IRQ_TYPE_NONE; + return 0; +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); struct irq_domain_ops irq_domain_simple_ops = { .map = irq_domain_simple_map, - .xlate = irq_domain_simple_xlate, + .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -- cgit v1.2.3 From a18dc81bf58258ac0920bec26b91656cb0140d2a Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Thu, 26 Jan 2012 12:12:14 -0700 Subject: irq_domain: constify irq_domain_ops Make irq_domain_ops pointer a constant to make it safer for multiple instances to share the same ops pointer and change the irq_domain code so that it does not modify the ops. v4: Fix mismatched type reference in powerpc code Signed-off-by: Grant Likely Cc: Thomas Gleixner Cc: Benjamin Herrenschmidt Cc: Milton Miller Tested-by: Olof Johansson --- arch/powerpc/sysdev/mpic_msi.c | 2 +- include/linux/irqdomain.h | 14 +++++++------- kernel/irq/irqdomain.c | 31 +++++++++++++++---------------- 3 files changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/sysdev/mpic_msi.c b/arch/powerpc/sysdev/mpic_msi.c index 00395f40fb5d..0622aa91b18a 100644 --- a/arch/powerpc/sysdev/mpic_msi.c +++ b/arch/powerpc/sysdev/mpic_msi.c @@ -32,7 +32,7 @@ void mpic_msi_reserve_hwirq(struct mpic *mpic, irq_hw_number_t hwirq) static int mpic_msi_reserve_u3_hwirqs(struct mpic *mpic) { irq_hw_number_t hwirq; - struct irq_domain_ops *ops = mpic->irqhost->ops; + const struct irq_domain_ops *ops = mpic->irqhost->ops; struct device_node *np; int flags, index, i; struct of_irq oirq; diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index ea58f36688a0..52454881938a 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -106,7 +106,7 @@ struct irq_domain { } linear; struct radix_tree_root tree; } revmap_data; - struct irq_domain_ops *ops; + const struct irq_domain_ops *ops; void *host_data; irq_hw_number_t inval_irq; @@ -119,17 +119,17 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data); extern struct irq_domain *irq_find_host(struct device_node *node); @@ -138,7 +138,7 @@ extern void irq_set_virq_count(unsigned int count); static inline struct irq_domain *irq_domain_add_legacy_isa( struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { return irq_domain_add_legacy(of_node, NUM_ISA_INTERRUPTS, 0, 0, ops, @@ -162,7 +162,7 @@ extern unsigned int irq_radix_revmap_lookup(struct irq_domain *host, extern unsigned int irq_linear_revmap(struct irq_domain *host, irq_hw_number_t hwirq); -extern struct irq_domain_ops irq_domain_simple_ops; +extern const struct irq_domain_ops irq_domain_simple_ops; /* stock xlate functions */ int irq_domain_xlate_onecell(struct irq_domain *d, struct device_node *ctrlr, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 456e3fc8387f..25a498eb98a3 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -26,11 +26,6 @@ static DEFINE_MUTEX(revmap_trees_mutex); static unsigned int irq_virq_count = NR_IRQS; static struct irq_domain *irq_default_domain; -static int default_irq_domain_match(struct irq_domain *d, struct device_node *np) -{ - return d->of_node != NULL && d->of_node == np; -} - /** * irq_domain_alloc() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -44,7 +39,7 @@ static int default_irq_domain_match(struct irq_domain *d, struct device_node *np */ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, unsigned int revmap_type, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -59,9 +54,6 @@ static struct irq_domain *irq_domain_alloc(struct device_node *of_node, domain->host_data = host_data; domain->of_node = of_node_get(of_node); - if (domain->ops->match == NULL) - domain->ops->match = default_irq_domain_match; - return domain; } @@ -104,7 +96,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, unsigned int size, unsigned int first_irq, irq_hw_number_t first_hwirq, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -170,7 +162,7 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, */ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, unsigned int size, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain; @@ -192,7 +184,7 @@ struct irq_domain *irq_domain_add_linear(struct device_node *of_node, } struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, @@ -211,7 +203,7 @@ struct irq_domain *irq_domain_add_nomap(struct device_node *of_node, * (the reverse mapping will use the slow path until that happens). */ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, - struct irq_domain_ops *ops, + const struct irq_domain_ops *ops, void *host_data) { struct irq_domain *domain = irq_domain_alloc(of_node, @@ -230,6 +222,7 @@ struct irq_domain *irq_domain_add_tree(struct device_node *of_node, struct irq_domain *irq_find_host(struct device_node *node) { struct irq_domain *h, *found = NULL; + int rc; /* We might want to match the legacy controller last since * it might potentially be set to match all interrupts in @@ -237,11 +230,17 @@ struct irq_domain *irq_find_host(struct device_node *node) * yet though... */ mutex_lock(&irq_domain_mutex); - list_for_each_entry(h, &irq_domain_list, link) - if (h->ops->match(h, node)) { + list_for_each_entry(h, &irq_domain_list, link) { + if (h->ops->match) + rc = h->ops->match(h, node); + else + rc = (h->of_node != NULL) && (h->of_node == node); + + if (rc) { found = h; break; } + } mutex_unlock(&irq_domain_mutex); return found; } @@ -760,7 +759,7 @@ int irq_domain_xlate_onetwocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_xlate_onetwocell); -struct irq_domain_ops irq_domain_simple_ops = { +const struct irq_domain_ops irq_domain_simple_ops = { .map = irq_domain_simple_map, .xlate = irq_domain_xlate_onetwocell, }; -- cgit v1.2.3 From 55ae451918ec62e553f11b6118fec157f90c31c3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Feb 2012 16:29:14 +0100 Subject: PM / Sleep: Unify kerneldoc comments in kernel/power/suspend.c The kerneldoc comments in kernel/power/suspend.c are not formatted in the same way and the quality of some of them is questionable. Unify the formatting and improve the contents. Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/suspend.c | 56 ++++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 03bc92b42750..e6b5ef958603 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -37,8 +37,8 @@ const char *const pm_states[PM_SUSPEND_MAX] = { static const struct platform_suspend_ops *suspend_ops; /** - * suspend_set_ops - Set the global suspend method table. - * @ops: Pointer to ops structure. + * suspend_set_ops - Set the global suspend method table. + * @ops: Suspend operations to use. */ void suspend_set_ops(const struct platform_suspend_ops *ops) { @@ -58,11 +58,11 @@ bool valid_state(suspend_state_t state) } /** - * suspend_valid_only_mem - generic memory-only valid callback + * suspend_valid_only_mem - Generic memory-only valid callback. * - * Platform drivers that implement mem suspend only and only need - * to check for that in their .valid callback can use this instead - * of rolling their own .valid callback. + * Platform drivers that implement mem suspend only and only need to check for + * that in their .valid() callback can use this instead of rolling their own + * .valid() callback. */ int suspend_valid_only_mem(suspend_state_t state) { @@ -83,10 +83,11 @@ static int suspend_test(int level) } /** - * suspend_prepare - Do prep work before entering low-power state. + * suspend_prepare - Prepare for entering system sleep state. * - * This is common code that is called for each state that we're entering. - * Run suspend notifiers, allocate a console and stop all processes. + * Common code run for every system sleep state that can be entered (except for + * hibernation). Run suspend notifiers, allocate the "suspend" console and + * freeze processes. */ static int suspend_prepare(void) { @@ -131,9 +132,9 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void) } /** - * suspend_enter - enter the desired system sleep state. - * @state: State to enter - * @wakeup: Returns information that suspend should not be entered again. + * suspend_enter - Make the system enter the given sleep state. + * @state: System sleep state to enter. + * @wakeup: Returns information that the sleep state should not be re-entered. * * This function should be called after devices have been suspended. */ @@ -199,9 +200,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } /** - * suspend_devices_and_enter - suspend devices and enter the desired system - * sleep state. - * @state: state to enter + * suspend_devices_and_enter - Suspend devices and enter system sleep state. + * @state: System sleep state to enter. */ int suspend_devices_and_enter(suspend_state_t state) { @@ -251,10 +251,10 @@ int suspend_devices_and_enter(suspend_state_t state) } /** - * suspend_finish - Do final work before exiting suspend sequence. + * suspend_finish - Clean up before finishing the suspend sequence. * - * Call platform code to clean up, restart processes, and free the - * console that we've allocated. This is not called for suspend-to-disk. + * Call platform code to clean up, restart processes, and free the console that + * we've allocated. This routine is not called for hibernation. */ static void suspend_finish(void) { @@ -265,14 +265,12 @@ static void suspend_finish(void) } /** - * enter_state - Do common work of entering low-power state. - * @state: pm_state structure for state we're entering. + * enter_state - Do common work needed to enter system sleep state. + * @state: System sleep state to enter. * - * Make sure we're the only ones trying to enter a sleep state. Fail - * if someone has beat us to it, since we don't want anything weird to - * happen when we wake up. - * Then, do the setup for suspend, enter the state, and cleaup (after - * we've woken up). + * Make sure that no one else is trying to put the system into a sleep state. + * Fail if that's not the case. Otherwise, prepare for system suspend, make the + * system enter the given sleep state and clean up after wakeup. */ int enter_state(suspend_state_t state) { @@ -310,11 +308,11 @@ int enter_state(suspend_state_t state) } /** - * pm_suspend - Externally visible function for suspending system. - * @state: Enumerated value of state to enter. + * pm_suspend - Externally visible function for suspending the system. + * @state: System sleep state to enter. * - * Determine whether or not value is within range, get state - * structure, and enter (above). + * Check if the value of @state represents one of the supported states, + * execute enter_state() and update system suspend statistics. */ int pm_suspend(suspend_state_t state) { -- cgit v1.2.3 From 93e1ee43a72b11e1b50aab87046c131a836a4456 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Feb 2012 16:29:24 +0100 Subject: PM / Sleep: Make enter_state() in kernel/power/suspend.c static The enter_state() function in kernel/power/suspend.c should be static and state_store() in kernel/power/suspend.c should call pm_suspend() instead of it, so make that happen (which also reduces code duplication related to suspend statistics). Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- kernel/power/main.c | 8 +++----- kernel/power/power.h | 2 -- kernel/power/suspend.c | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index b1e324878d5f..1c12581f1c62 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -291,12 +291,10 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #ifdef CONFIG_SUSPEND for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + error = pm_suspend(state); break; - } - if (state < PM_SUSPEND_MAX && *s) { - error = enter_state(state); - suspend_stats_update(error); + } } #endif diff --git a/kernel/power/power.h b/kernel/power/power.h index 398d42b48e9e..98f3622d7407 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -177,13 +177,11 @@ extern const char *const pm_states[]; extern bool valid_state(suspend_state_t state); extern int suspend_devices_and_enter(suspend_state_t state); -extern int enter_state(suspend_state_t state); #else /* !CONFIG_SUSPEND */ static inline int suspend_devices_and_enter(suspend_state_t state) { return -ENOSYS; } -static inline int enter_state(suspend_state_t state) { return -ENOSYS; } static inline bool valid_state(suspend_state_t state) { return false; } #endif /* !CONFIG_SUSPEND */ diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index e6b5ef958603..4914358a0543 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -272,7 +272,7 @@ static void suspend_finish(void) * Fail if that's not the case. Otherwise, prepare for system suspend, make the * system enter the given sleep state and clean up after wakeup. */ -int enter_state(suspend_state_t state) +static int enter_state(suspend_state_t state) { int error; -- cgit v1.2.3 From bc25cf508942c56810d4fb623ef27b56ccef7783 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Feb 2012 16:29:33 +0100 Subject: PM / Sleep: Drop suspend_stats_update() Since suspend_stats_update() is only called from pm_suspend(), move its code directly into that function and remove the static inline definition from include/linux/suspend.h. Clean_up pm_suspend() in the process. Signed-off-by: Rafael J. Wysocki Acked-by: Srivatsa S. Bhat --- include/linux/suspend.h | 16 ---------------- kernel/power/suspend.c | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/suspend.h b/include/linux/suspend.h index b90191894441..ac1c114c499d 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -94,22 +94,6 @@ static inline void dpm_save_failed_step(enum suspend_stat_step step) suspend_stats.last_failed_step %= REC_FAILED_NUM; } -/** - * suspend_stats_update - Update success/failure statistics of suspend-to-ram - * - * @error: Value returned by enter_state() function - */ -static inline void suspend_stats_update(int error) -{ - if (error) { - suspend_stats.fail++; - dpm_save_failed_errno(error); - } else { - suspend_stats.success++; - } -} - - /** * struct platform_suspend_ops - Callbacks for managing platform dependent * system sleep states. diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 4914358a0543..88e5c967370d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -316,12 +316,18 @@ static int enter_state(suspend_state_t state) */ int pm_suspend(suspend_state_t state) { - int ret; - if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { - ret = enter_state(state); - suspend_stats_update(ret); - return ret; + int error; + + if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) + return -EINVAL; + + error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else { + suspend_stats.success++; } - return -EINVAL; + return error; } EXPORT_SYMBOL(pm_suspend); -- cgit v1.2.3 From 69f1d475cc80c55121852b3030873cdd407fd31c Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 14 Feb 2012 22:20:52 +0100 Subject: PM / Hibernate: print physical addresses consistently with other parts of kernel Print physical address info in a style consistent with the %pR style used elsewhere in the kernel. Signed-off-by: Bjorn Helgaas Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..8e2e7461375f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -711,9 +711,10 @@ static void mark_nosave_pages(struct memory_bitmap *bm) list_for_each_entry(region, &nosave_regions, list) { unsigned long pfn; - pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", - region->start_pfn << PAGE_SHIFT, - region->end_pfn << PAGE_SHIFT); + pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n", + (unsigned long long) region->start_pfn << PAGE_SHIFT, + ((unsigned long long) region->end_pfn << PAGE_SHIFT) + - 1); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) if (pfn_valid(pfn)) { -- cgit v1.2.3 From 1fd36adcd98c14d2fd97f545293c488775cb2823 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 16 Feb 2012 17:49:54 +0000 Subject: Replace the fd_sets in struct fdtable with an array of unsigned longs Replace the fd_sets in struct fdtable with an array of unsigned longs and then use the standard non-atomic bit operations rather than the FD_* macros. This: (1) Removes the abuses of struct fd_set: (a) Since we don't want to allocate a full fd_set the vast majority of the time, we actually, in effect, just allocate a just-big-enough array of unsigned longs and cast it to an fd_set type - so why bother with the fd_set at all? (b) Some places outside of the core fdtable handling code (such as SELinux) want to look inside the array of unsigned longs hidden inside the fd_set struct for more efficient iteration over the entire set. (2) Eliminates the use of FD_*() macros in the kernel completely. (3) Permits the __FD_*() macros to be deleted entirely where not exposed to userspace. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120216174954.23314.48147.stgit@warthog.procyon.org.uk Signed-off-by: H. Peter Anvin Cc: Al Viro --- fs/exec.c | 4 ++-- fs/file.c | 46 ++++++++++++++++++++++------------------------ fs/select.c | 2 +- include/linux/fdtable.h | 28 ++++++++++------------------ kernel/exit.c | 2 +- security/selinux/hooks.c | 2 +- 6 files changed, 37 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 22cc38d9e79f..cfd5e3047bd8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1026,10 +1026,10 @@ static void flush_old_files(struct files_struct * files) fdt = files_fdtable(files); if (i >= fdt->max_fds) break; - set = fdt->close_on_exec->fds_bits[j]; + set = fdt->close_on_exec[j]; if (!set) continue; - fdt->close_on_exec->fds_bits[j] = 0; + fdt->close_on_exec[j] = 0; spin_unlock(&files->file_lock); for ( ; set ; i++,set >>= 1) { if (set & 1) { diff --git a/fs/file.c b/fs/file.c index 114fea0a2cec..2d479dd8484e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -40,7 +40,7 @@ int sysctl_nr_open_max = 1024 * 1024; /* raised later */ */ static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list); -static void *alloc_fdmem(unsigned int size) +static void *alloc_fdmem(size_t size) { /* * Very large allocations can stress page reclaim, so fall back to @@ -142,7 +142,7 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) static struct fdtable * alloc_fdtable(unsigned int nr) { struct fdtable *fdt; - char *data; + void *data; /* * Figure out how many fds we actually want to support in this fdtable. @@ -172,14 +172,15 @@ static struct fdtable * alloc_fdtable(unsigned int nr) data = alloc_fdmem(nr * sizeof(struct file *)); if (!data) goto out_fdt; - fdt->fd = (struct file **)data; - data = alloc_fdmem(max_t(unsigned int, + fdt->fd = data; + + data = alloc_fdmem(max_t(size_t, 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); if (!data) goto out_arr; - fdt->open_fds = (fd_set *)data; - data += nr / BITS_PER_BYTE; - fdt->close_on_exec = (fd_set *)data; + fdt->open_fds = data; + data += nr / BITS_PER_LONG; + fdt->close_on_exec = data; fdt->next = NULL; return fdt; @@ -275,11 +276,11 @@ static int count_open_files(struct fdtable *fdt) int i; /* Find the last open fd */ - for (i = size/(8*sizeof(long)); i > 0; ) { - if (fdt->open_fds->fds_bits[--i]) + for (i = size / BITS_PER_LONG; i > 0; ) { + if (fdt->open_fds[--i]) break; } - i = (i+1) * 8 * sizeof(long); + i = (i + 1) * BITS_PER_LONG; return i; } @@ -306,8 +307,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; - new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; - new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->close_on_exec = newf->close_on_exec_init; + new_fdt->open_fds = newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; new_fdt->next = NULL; @@ -350,10 +351,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) old_fds = old_fdt->fd; new_fds = new_fdt->fd; - memcpy(new_fdt->open_fds->fds_bits, - old_fdt->open_fds->fds_bits, open_files/8); - memcpy(new_fdt->close_on_exec->fds_bits, - old_fdt->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8); + memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -379,11 +378,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) memset(new_fds, 0, size); if (new_fdt->max_fds > open_files) { - int left = (new_fdt->max_fds-open_files)/8; - int start = open_files / (8 * sizeof(unsigned long)); + int left = (new_fdt->max_fds - open_files) / 8; + int start = open_files / BITS_PER_LONG; - memset(&new_fdt->open_fds->fds_bits[start], 0, left); - memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds[start], 0, left); + memset(&new_fdt->close_on_exec[start], 0, left); } rcu_assign_pointer(newf->fdt, new_fdt); @@ -419,8 +418,8 @@ struct files_struct init_files = { .fdtab = { .max_fds = NR_OPEN_DEFAULT, .fd = &init_files.fd_array[0], - .close_on_exec = (fd_set *)&init_files.close_on_exec_init, - .open_fds = (fd_set *)&init_files.open_fds_init, + .close_on_exec = init_files.close_on_exec_init, + .open_fds = init_files.open_fds_init, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; @@ -443,8 +442,7 @@ repeat: fd = files->next_fd; if (fd < fdt->max_fds) - fd = find_next_zero_bit(fdt->open_fds->fds_bits, - fdt->max_fds, fd); + fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd); error = expand_files(files, fd); if (error < 0) diff --git a/fs/select.c b/fs/select.c index d33418fdc858..2e7fbe8a092c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -348,7 +348,7 @@ static int max_select_fd(unsigned long n, fd_set_bits *fds) set = ~(~0UL << (n & (__NFDBITS-1))); n /= __NFDBITS; fdt = files_fdtable(current->files); - open_fds = fdt->open_fds->fds_bits+n; + open_fds = fdt->open_fds + n; max = 0; if (set) { set &= BITS(fds, n); diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index 7675da2c18f7..158a41eed314 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -21,51 +21,43 @@ */ #define NR_OPEN_DEFAULT BITS_PER_LONG -/* - * The embedded_fd_set is a small fd_set, - * suitable for most tasks (which open <= BITS_PER_LONG files) - */ -struct embedded_fd_set { - unsigned long fds_bits[1]; -}; - struct fdtable { unsigned int max_fds; struct file __rcu **fd; /* current fd array */ - fd_set *close_on_exec; - fd_set *open_fds; + unsigned long *close_on_exec; + unsigned long *open_fds; struct rcu_head rcu; struct fdtable *next; }; static inline void __set_close_on_exec(int fd, struct fdtable *fdt) { - FD_SET(fd, fdt->close_on_exec); + __set_bit(fd, fdt->close_on_exec); } static inline void __clear_close_on_exec(int fd, struct fdtable *fdt) { - FD_CLR(fd, fdt->close_on_exec); + __clear_bit(fd, fdt->close_on_exec); } static inline bool close_on_exec(int fd, const struct fdtable *fdt) { - return FD_ISSET(fd, fdt->close_on_exec); + return test_bit(fd, fdt->close_on_exec); } static inline void __set_open_fd(int fd, struct fdtable *fdt) { - FD_SET(fd, fdt->open_fds); + __set_bit(fd, fdt->open_fds); } static inline void __clear_open_fd(int fd, struct fdtable *fdt) { - FD_CLR(fd, fdt->open_fds); + __clear_bit(fd, fdt->open_fds); } static inline bool fd_is_open(int fd, const struct fdtable *fdt) { - return FD_ISSET(fd, fdt->open_fds); + return test_bit(fd, fdt->open_fds); } /* @@ -83,8 +75,8 @@ struct files_struct { */ spinlock_t file_lock ____cacheline_aligned_in_smp; int next_fd; - struct embedded_fd_set close_on_exec_init; - struct embedded_fd_set open_fds_init; + unsigned long close_on_exec_init[1]; + unsigned long open_fds_init[1]; struct file __rcu * fd_array[NR_OPEN_DEFAULT]; }; diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..4db020015f14 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -473,7 +473,7 @@ static void close_files(struct files_struct * files) i = j * __NFDBITS; if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j++]; + set = fdt->open_fds[j++]; while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 6a3683e28426..421c990a20b2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2145,7 +2145,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, fdt = files_fdtable(files); if (i >= fdt->max_fds) break; - set = fdt->open_fds->fds_bits[j]; + set = fdt->open_fds[j]; if (!set) continue; spin_unlock(&files->file_lock); -- cgit v1.2.3 From 6684ba202b5ab2f36d574c72fe50c207d99b3e35 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sun, 19 Feb 2012 17:38:00 -0800 Subject: compat: Add helper functions to read/write struct timeval, timespec Add helper functions to read and write struct timeval and struct timespec from userspace. We already had helper functions for reading and writing struct compat_timespec; add a set of functions to do the same with struct timeval, and add a second suite of functions which can be sensitive to COMPAT_USE_64BIT_TIME and access either 32- or 64-bit time structures. This also exports these helper functions to modules. Rename the existing inlines for converting between struct compat_timeval and native struct timespec so we can have a saner naming convention for the exported functions. Suggested-by: Linus Torvalds Signed-off-by: H. Peter Anvin --- include/linux/compat.h | 16 ++++++++++++ kernel/compat.c | 68 ++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 76 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/compat.h b/include/linux/compat.h index 1be91c048249..a82e452bbdb9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -87,10 +87,26 @@ typedef struct { compat_sigset_word sig[_COMPAT_NSIG_WORDS]; } compat_sigset_t; +/* + * These functions operate strictly on struct compat_time* + */ extern int get_compat_timespec(struct timespec *, const struct compat_timespec __user *); extern int put_compat_timespec(const struct timespec *, struct compat_timespec __user *); +extern int get_compat_timeval(struct timeval *, + const struct compat_timeval __user *); +extern int put_compat_timeval(const struct timeval *, + struct compat_timeval __user *); +/* + * These functions operate on 32- or 64-bit specs depending on + * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments and the + * naming as compat_get/put_ rather than get/put_compat_. + */ +extern int compat_get_timespec(struct timespec *, const void __user *); +extern int compat_put_timespec(const struct timespec *, void __user *); +extern int compat_get_timeval(struct timeval *, const void __user *); +extern int compat_put_timeval(const struct timeval *, void __user *); struct compat_iovec { compat_uptr_t iov_base; diff --git a/kernel/compat.c b/kernel/compat.c index f346cedfe24d..74ff8498809a 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -31,11 +31,10 @@ #include /* - * Note that the native side is already converted to a timespec, because - * that's what we want anyway. + * Get/set struct timeval with struct timespec on the native side */ -static int compat_get_timeval(struct timespec *o, - struct compat_timeval __user *i) +static int compat_get_timeval_convert(struct timespec *o, + struct compat_timeval __user *i) { long usec; @@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o, return 0; } -static int compat_put_timeval(struct compat_timeval __user *o, - struct timeval *i) +static int compat_put_timeval_convert(struct compat_timeval __user *o, + struct timeval *i) { return (put_user(i->tv_sec, &o->tv_sec) || put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; @@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, if (tv) { struct timeval ktv; do_gettimeofday(&ktv); - if (compat_put_timeval(tv, &ktv)) + if (compat_put_timeval_convert(tv, &ktv)) return -EFAULT; } if (tz) { @@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, struct timezone ktz; if (tv) { - if (compat_get_timeval(&kts, tv)) + if (compat_get_timeval_convert(&kts, tv)) return -EFAULT; } if (tz) { @@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); } +int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) || + __get_user(tv->tv_sec, &ctv->tv_sec) || + __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(get_compat_timeval); + +int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv) +{ + return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) || + __put_user(tv->tv_sec, &ctv->tv_sec) || + __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0; +} +EXPORT_SYMBOL_GPL(put_compat_timeval); + int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || __get_user(ts->tv_sec, &cts->tv_sec) || __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; } +EXPORT_SYMBOL_GPL(get_compat_timespec); int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) { @@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user } EXPORT_SYMBOL_GPL(put_compat_timespec); +int compat_get_timeval(struct timeval *tv, const void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + else + return get_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_get_timeval); + +int compat_put_timeval(const struct timeval *tv, void __user *utv) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + else + return put_compat_timeval(tv, utv); +} +EXPORT_SYMBOL_GPL(compat_put_timeval); + +int compat_get_timespec(struct timespec *ts, const void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + else + return get_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_get_timespec); + +int compat_put_timespec(const struct timespec *ts, void __user *uts) +{ + if (COMPAT_USE_64BIT_TIME) + return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + else + return put_compat_timespec(ts, uts); +} +EXPORT_SYMBOL_GPL(compat_put_timespec); + static long compat_nanosleep_restart(struct restart_block *restart) { struct compat_timespec __user *rmtp; -- cgit v1.2.3 From e404b321dbb2d6e438522b7dce9c1d0c6a8c5275 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Sun, 19 Feb 2012 14:16:07 +0300 Subject: tracing: Don't print an extra separator of flags If __print_flags() is used after another __print_*() function, the temp seq_file buffer will not be empty on entry, and the delimiter will be printed even though there's just one field. We get something like: |S instead of just: S This is because the length of the temp seq buffer is used to determine if the delimiter is printed or not. But this algorithm fails when the seq buffer is not empty on entry, and the delimiter will be printed because it thinks that a previous field was already printed. Link: http://lkml.kernel.org/r/1329650167-480655-1-git-send-email-avagin@openvz.org Signed-off-by: Andrew Vagin Cc: Ingo Molnar Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..3efd718984fb 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -300,7 +300,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long mask; const char *str; const char *ret = p->buffer + p->len; - int i; + int i, first = 1; for (i = 0; flag_array[i].name && flags; i++) { @@ -310,8 +310,10 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, str = flag_array[i].name; flags &= ~mask; - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); + else + first = 0; trace_seq_puts(p, str); } -- cgit v1.2.3 From 5b34926114e39e12005031269613d2b13194aeba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Feb 2012 20:37:32 -0500 Subject: tracing: Don't use p->len field to determine output in __print_*() functions If more than one __print_*() function is used in a tracepoint (__print_flags(), __print_symbols(), etc), then the temp seq buffer will not be zero on entry. Using the temp seq buffer's length to know if data has been printed or not in the current function is incorrect and may produce incorrect results. Currently, no in-tree tracepoint causes this bug, but new ones may be created. Cc: Andrew Vagin Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 3efd718984fb..c5a01873567d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -319,7 +319,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim, /* check for left over flags */ if (flags) { - if (p->len && delim) + if (!first && delim) trace_seq_puts(p, delim); trace_seq_printf(p, "0x%lx", flags); } @@ -346,7 +346,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%lx", val); trace_seq_putc(p, 0); @@ -372,7 +372,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, break; } - if (!p->len) + if (ret == (const char *)(p->buffer + p->len)) trace_seq_printf(p, "0x%llx", val); trace_seq_putc(p, 0); -- cgit v1.2.3 From e248491ac283b516958ca9ab62c8e74b6718bca8 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:48 +0100 Subject: ftrace: Add enable/disable ftrace_ops control interface Adding a way to temporarily enable/disable ftrace_ops. The change follows the same way as 'global' ftrace_ops are done. Introducing 2 global ftrace_ops - control_ops and ftrace_control_list which take over all ftrace_ops registered with FTRACE_OPS_FL_CONTROL flag. In addition new per cpu flag called 'disabled' is also added to ftrace_ops to provide the control information for each cpu. When ftrace_ops with FTRACE_OPS_FL_CONTROL is registered, it is set as disabled for all cpus. The ftrace_control_list contains all the registered 'control' ftrace_ops. The control_ops provides function which iterates ftrace_control_list and does the check for 'disabled' flag on current cpu. Adding 3 inline functions: ftrace_function_local_disable/ftrace_function_local_enable - enable/disable the ftrace_ops on current cpu ftrace_function_local_disabled - get disabled ftrace_ops::disabled value for current cpu Link: http://lkml.kernel.org/r/1329317514-8131-2-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 66 +++++++++++++++++++++++++++++ kernel/trace/ftrace.c | 111 +++++++++++++++++++++++++++++++++++++++++++++---- kernel/trace/trace.h | 2 + 3 files changed, 172 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f33fb3b041c6..64a309d2fbd6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,16 +31,33 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); +/* + * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are + * set in the flags member. + * + * ENABLED - set/unset when ftrace_ops is registered/unregistered + * GLOBAL - set manualy by ftrace_ops user to denote the ftrace_ops + * is part of the global tracers sharing the same filter + * via set_ftrace_* debugfs files. + * DYNAMIC - set when ftrace_ops is registered to denote dynamically + * allocated ftrace_ops which need special care + * CONTROL - set manualy by ftrace_ops user to denote the ftrace_ops + * could be controled by following calls: + * ftrace_function_local_enable + * ftrace_function_local_disable + */ enum { FTRACE_OPS_FL_ENABLED = 1 << 0, FTRACE_OPS_FL_GLOBAL = 1 << 1, FTRACE_OPS_FL_DYNAMIC = 1 << 2, + FTRACE_OPS_FL_CONTROL = 1 << 3, }; struct ftrace_ops { ftrace_func_t func; struct ftrace_ops *next; unsigned long flags; + int __percpu *disabled; #ifdef CONFIG_DYNAMIC_FTRACE struct ftrace_hash *notrace_hash; struct ftrace_hash *filter_hash; @@ -97,6 +114,55 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); +/** + * ftrace_function_local_enable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_enable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))--; +} + +/** + * ftrace_function_local_disable - enable controlled ftrace_ops on current cpu + * + * This function enables tracing on current cpu by decreasing + * the per cpu control variable. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline void ftrace_function_local_disable(struct ftrace_ops *ops) +{ + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL))) + return; + + (*this_cpu_ptr(ops->disabled))++; +} + +/** + * ftrace_function_local_disabled - returns ftrace_ops disabled value + * on current cpu + * + * This function returns value of ftrace_ops::disabled on current cpu. + * It must be called with preemption disabled and only on ftrace_ops + * registered with FTRACE_OPS_FL_CONTROL. If called without preemption + * disabled, this_cpu_ptr will complain when CONFIG_DEBUG_PREEMPT is enabled. + */ +static inline int ftrace_function_local_disabled(struct ftrace_ops *ops) +{ + WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_CONTROL)); + return *this_cpu_ptr(ops->disabled); +} + extern void ftrace_stub(unsigned long a0, unsigned long a1); #else /* !CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d1499e910fe8..f615f974d90e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,6 +62,8 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 +#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) + /* ftrace_enabled is a method to turn ftrace on or off */ int ftrace_enabled __read_mostly; static int last_ftrace_enabled; @@ -89,12 +91,14 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { }; static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; +static struct ftrace_ops control_ops; static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); @@ -168,6 +172,32 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) } #endif +static void control_ops_disable_all(struct ftrace_ops *ops) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(ops->disabled, cpu) = 1; +} + +static int control_ops_alloc(struct ftrace_ops *ops) +{ + int __percpu *disabled; + + disabled = alloc_percpu(int); + if (!disabled) + return -ENOMEM; + + ops->disabled = disabled; + control_ops_disable_all(ops); + return 0; +} + +static void control_ops_free(struct ftrace_ops *ops) +{ + free_percpu(ops->disabled); +} + static void update_global_ops(void) { ftrace_func_t func; @@ -259,6 +289,26 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } +static void add_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int first = *list == &ftrace_list_end; + add_ftrace_ops(list, ops); + if (first) + add_ftrace_ops(&ftrace_ops_list, main_ops); +} + +static int remove_ftrace_list_ops(struct ftrace_ops **list, + struct ftrace_ops *main_ops, + struct ftrace_ops *ops) +{ + int ret = remove_ftrace_ops(list, ops); + if (!ret && *list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); + return ret; +} + static int __register_ftrace_function(struct ftrace_ops *ops) { if (ftrace_disabled) @@ -270,15 +320,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) return -EBUSY; + /* We don't support both control and global flags set. */ + if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) + return -EINVAL; + if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - int first = ftrace_global_list == &ftrace_list_end; - add_ftrace_ops(&ftrace_global_list, ops); + add_ftrace_list_ops(&ftrace_global_list, &global_ops, ops); ops->flags |= FTRACE_OPS_FL_ENABLED; - if (first) - add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + if (control_ops_alloc(ops)) + return -ENOMEM; + add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); } else add_ftrace_ops(&ftrace_ops_list, ops); @@ -302,11 +357,23 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) return -EINVAL; if (ops->flags & FTRACE_OPS_FL_GLOBAL) { - ret = remove_ftrace_ops(&ftrace_global_list, ops); - if (!ret && ftrace_global_list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + ret = remove_ftrace_list_ops(&ftrace_global_list, + &global_ops, ops); if (!ret) ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else if (ops->flags & FTRACE_OPS_FL_CONTROL) { + ret = remove_ftrace_list_ops(&ftrace_control_list, + &control_ops, ops); + if (!ret) { + /* + * The ftrace_ops is now removed from the list, + * so there'll be no new users. We must ensure + * all current users are done before we free + * the control data. + */ + synchronize_sched(); + control_ops_free(ops); + } } else ret = remove_ftrace_ops(&ftrace_ops_list, ops); @@ -3873,6 +3940,36 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) #endif /* CONFIG_DYNAMIC_FTRACE */ +static void +ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) + return; + + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). + */ + preempt_disable_notrace(); + trace_recursion_set(TRACE_CONTROL_BIT); + op = rcu_dereference_raw(ftrace_control_list); + while (op != &ftrace_list_end) { + if (!ftrace_function_local_disabled(op) && + ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + + op = rcu_dereference_raw(op->next); + }; + trace_recursion_clear(TRACE_CONTROL_BIT); + preempt_enable_notrace(); +} + +static struct ftrace_ops control_ops = { + .func = ftrace_ops_control_func, +}; + static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) { diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b93ecbadad6d..55c6ea00f28a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -288,6 +288,8 @@ struct tracer { /* for function tracing recursion */ #define TRACE_INTERNAL_BIT (1<<11) #define TRACE_GLOBAL_BIT (1<<12) +#define TRACE_CONTROL_BIT (1<<13) + /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function -- cgit v1.2.3 From ceec0b6fc7cd43b38a40c2d40223f9cd0616f0cd Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:49 +0100 Subject: ftrace, perf: Add open/close tracepoint perf registration actions Adding TRACE_REG_PERF_OPEN and TRACE_REG_PERF_CLOSE to differentiate register/unregister from open/close actions. The register/unregister actions are invoked for the first/last tracepoint user when opening/closing the event. The open/close actions are invoked for each tracepoint user when opening/closing the event. Link: http://lkml.kernel.org/r/1329317514-8131-3-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 6 ++- kernel/trace/trace_event_perf.c | 116 ++++++++++++++++++++++++++-------------- kernel/trace/trace_events.c | 10 ++-- kernel/trace/trace_kprobe.c | 6 ++- kernel/trace/trace_syscalls.c | 14 +++-- 5 files changed, 101 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c3da42dd22ba..195e3606ddd7 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -146,6 +146,8 @@ enum trace_reg { TRACE_REG_UNREGISTER, TRACE_REG_PERF_REGISTER, TRACE_REG_PERF_UNREGISTER, + TRACE_REG_PERF_OPEN, + TRACE_REG_PERF_CLOSE, }; struct ftrace_event_call; @@ -157,7 +159,7 @@ struct ftrace_event_class { void *perf_probe; #endif int (*reg)(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); int (*define_fields)(struct ftrace_event_call *); struct list_head *(*get_fields)(struct ftrace_event_call *); struct list_head fields; @@ -165,7 +167,7 @@ struct ftrace_event_class { }; extern int ftrace_event_reg(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); enum { TRACE_EVENT_FL_ENABLED_BIT, diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 19a359d5e6d5..0cfcc37f63de 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -44,23 +44,17 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event, return 0; } -static int perf_trace_event_init(struct ftrace_event_call *tp_event, - struct perf_event *p_event) +static int perf_trace_event_reg(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { struct hlist_head __percpu *list; - int ret; + int ret = -ENOMEM; int cpu; - ret = perf_trace_event_perm(tp_event, p_event); - if (ret) - return ret; - p_event->tp_event = tp_event; if (tp_event->perf_refcount++ > 0) return 0; - ret = -ENOMEM; - list = alloc_percpu(struct hlist_head); if (!list) goto fail; @@ -83,7 +77,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, } } - ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL); if (ret) goto fail; @@ -108,6 +102,69 @@ fail: return ret; } +static void perf_trace_event_unreg(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); +} + +static int perf_trace_event_open(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event); +} + +static void perf_trace_event_close(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event); +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + int ret; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_reg(tp_event, p_event); + if (ret) + return ret; + + ret = perf_trace_event_open(p_event); + if (ret) { + perf_trace_event_unreg(p_event); + return ret; + } + + return 0; +} + int perf_trace_init(struct perf_event *p_event) { struct ftrace_event_call *tp_event; @@ -130,6 +187,14 @@ int perf_trace_init(struct perf_event *p_event) return ret; } +void perf_trace_destroy(struct perf_event *p_event) +{ + mutex_lock(&event_mutex); + perf_trace_event_close(p_event); + perf_trace_event_unreg(p_event); + mutex_unlock(&event_mutex); +} + int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; @@ -154,37 +219,6 @@ void perf_trace_del(struct perf_event *p_event, int flags) hlist_del_rcu(&p_event->hlist_entry); } -void perf_trace_destroy(struct perf_event *p_event) -{ - struct ftrace_event_call *tp_event = p_event->tp_event; - int i; - - mutex_lock(&event_mutex); - if (--tp_event->perf_refcount > 0) - goto out; - - tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); - - /* - * Ensure our callback won't be called anymore. The buffers - * will be freed after that. - */ - tracepoint_synchronize_unregister(); - - free_percpu(tp_event->perf_events); - tp_event->perf_events = NULL; - - if (!--total_ref_count) { - for (i = 0; i < PERF_NR_CONTEXTS; i++) { - free_percpu(perf_trace_buf[i]); - perf_trace_buf[i] = NULL; - } - } -out: - module_put(tp_event->mod); - mutex_unlock(&event_mutex); -} - __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, struct pt_regs *regs, int *rctxp) { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c212a7f934ec..5138fea37908 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -147,7 +147,8 @@ int trace_event_raw_init(struct ftrace_event_call *call) } EXPORT_SYMBOL_GPL(trace_event_raw_init); -int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +int ftrace_event_reg(struct ftrace_event_call *call, + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -170,6 +171,9 @@ int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) call->class->perf_probe, call); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; @@ -209,7 +213,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_stop_cmdline_record(); call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; } - call->class->reg(call, TRACE_REG_UNREGISTER); + call->class->reg(call, TRACE_REG_UNREGISTER, NULL); } break; case 1: @@ -218,7 +222,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, tracing_start_cmdline_record(); call->flags |= TRACE_EVENT_FL_RECORDED_CMD; } - ret = call->class->reg(call, TRACE_REG_REGISTER); + ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 00d527c945a4..5667f8958cc9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1892,7 +1892,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, #endif /* CONFIG_PERF_EVENTS */ static __kprobes -int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +int kprobe_register(struct ftrace_event_call *event, + enum trace_reg type, void *data) { struct trace_probe *tp = (struct trace_probe *)event->data; @@ -1909,6 +1910,9 @@ int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) case TRACE_REG_PERF_UNREGISTER: disable_trace_probe(tp, TP_FLAG_PROFILE); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 43500153dd1e..e23515f51ed4 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -17,9 +17,9 @@ static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type); + enum trace_reg type, void *data); static int syscall_enter_define_fields(struct ftrace_event_call *call); static int syscall_exit_define_fields(struct ftrace_event_call *call); @@ -649,7 +649,7 @@ void perf_sysexit_disable(struct ftrace_event_call *call) #endif /* CONFIG_PERF_EVENTS */ static int syscall_enter_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -664,13 +664,16 @@ static int syscall_enter_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysenter_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; } static int syscall_exit_register(struct ftrace_event_call *event, - enum trace_reg type) + enum trace_reg type, void *data) { switch (type) { case TRACE_REG_REGISTER: @@ -685,6 +688,9 @@ static int syscall_exit_register(struct ftrace_event_call *event, case TRACE_REG_PERF_UNREGISTER: perf_sysexit_disable(event); return 0; + case TRACE_REG_PERF_OPEN: + case TRACE_REG_PERF_CLOSE: + return 0; #endif } return 0; -- cgit v1.2.3 From 489c75c3b333dfda4c8d2b7ad1b00e5da024bfa7 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:50 +0100 Subject: ftrace, perf: Add add/del tracepoint perf registration actions Adding TRACE_REG_PERF_ADD and TRACE_REG_PERF_DEL to handle perf event schedule in/out actions. The add action is invoked for when the perf event is scheduled in, while the del action is invoked when the event is scheduled out. Link: http://lkml.kernel.org/r/1329317514-8131-4-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_events.c | 2 ++ kernel/trace/trace_kprobe.c | 2 ++ kernel/trace/trace_syscalls.c | 4 ++++ 5 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 195e3606ddd7..2bf677cb2d6f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -148,6 +148,8 @@ enum trace_reg { TRACE_REG_PERF_UNREGISTER, TRACE_REG_PERF_OPEN, TRACE_REG_PERF_CLOSE, + TRACE_REG_PERF_ADD, + TRACE_REG_PERF_DEL, }; struct ftrace_event_call; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0cfcc37f63de..d72af0b03822 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -211,12 +211,14 @@ int perf_trace_add(struct perf_event *p_event, int flags) list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); - return 0; + return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event); } void perf_trace_del(struct perf_event *p_event, int flags) { + struct ftrace_event_call *tp_event = p_event->tp_event; hlist_del_rcu(&p_event->hlist_entry); + tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5138fea37908..079a93ae8a9d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -173,6 +173,8 @@ int ftrace_event_reg(struct ftrace_event_call *call, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5667f8958cc9..580a05ec926b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1912,6 +1912,8 @@ int kprobe_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index e23515f51ed4..96fc73369099 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -666,6 +666,8 @@ static int syscall_enter_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } @@ -690,6 +692,8 @@ static int syscall_exit_register(struct ftrace_event_call *event, return 0; case TRACE_REG_PERF_OPEN: case TRACE_REG_PERF_CLOSE: + case TRACE_REG_PERF_ADD: + case TRACE_REG_PERF_DEL: return 0; #endif } -- cgit v1.2.3 From e59a0bff3ecf389951e3c9378ddfd00f6448bfaa Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:51 +0100 Subject: ftrace: Add FTRACE_ENTRY_REG macro to allow event registration Adding FTRACE_ENTRY_REG macro so particular ftrace entries could specify registration function and thus become accesible via perf. This will be used in upcomming patch for function trace. Link: http://lkml.kernel.org/r/1329317514-8131-5-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 ++++ kernel/trace/trace_export.c | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55c6ea00f28a..638476af8d7b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -68,6 +68,10 @@ enum trace_type { #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + #include "trace_entries.h" /* diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..f74de861cde9 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -18,6 +18,14 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM ftrace +/* + * The FTRACE_ENTRY_REG macro allows ftrace entry to define register + * function and thus become accesible via perf. + */ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + /* not needed for this file */ #undef __field_struct #define __field_struct(type, item) @@ -152,13 +160,14 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #undef F_printk #define F_printk(fmt, args...) #fmt ", " __stringify(args) -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#undef FTRACE_ENTRY_REG +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn)\ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ + .reg = regfn, \ }; \ \ struct ftrace_event_call __used event_##call = { \ @@ -170,4 +179,9 @@ struct ftrace_event_call __used event_##call = { \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + FTRACE_ENTRY_REG(call, struct_name, etype, \ + PARAMS(tstruct), PARAMS(print), NULL) + #include "trace_entries.h" -- cgit v1.2.3 From ced39002f5ea736b716ae233fb68b26d59783912 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:52 +0100 Subject: ftrace, perf: Add support to use function tracepoint in perf Adding perf registration support for the ftrace function event, so it is now possible to register it via perf interface. The perf_event struct statically contains ftrace_ops as a handle for function tracer. The function tracer is registered/unregistered in open/close actions. To be efficient, we enable/disable ftrace_ops each time the traced process is scheduled in/out (via TRACE_REG_PERF_(ADD|DELL) handlers). This way tracing is enabled only when the process is running. Intentionally using this way instead of the event's hw state PERF_HES_STOPPED, which would not disable the ftrace_ops. It is now possible to use function trace within perf commands like: perf record -e ftrace:function ls perf stat -e ftrace:function ls Allowed only for root. Link: http://lkml.kernel.org/r/1329317514-8131-6-git-send-email-jolsa@redhat.com Acked-by: Frederic Weisbecker Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/perf_event.h | 3 ++ kernel/trace/trace.h | 11 ++++++ kernel/trace/trace_entries.h | 6 ++- kernel/trace/trace_event_perf.c | 86 +++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_export.c | 5 +++ 5 files changed, 109 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..92a056f6d18d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -859,6 +859,9 @@ struct perf_event { #ifdef CONFIG_EVENT_TRACING struct ftrace_event_call *tp_event; struct event_filter *filter; +#ifdef CONFIG_FUNCTION_TRACER + struct ftrace_ops ftrace_ops; +#endif #endif #ifdef CONFIG_CGROUP_PERF diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 638476af8d7b..76a1c5094bbf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -595,6 +595,8 @@ static inline int ftrace_trace_task(struct task_struct *task) static inline int ftrace_is_dead(void) { return 0; } #endif +int ftrace_event_is_function(struct ftrace_event_call *call); + /* * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found @@ -832,4 +834,13 @@ extern const char *__stop___trace_bprintk_fmt[]; FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" +#ifdef CONFIG_PERF_EVENTS +#ifdef CONFIG_FUNCTION_TRACER +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data); +#else +#define perf_ftrace_event_register NULL +#endif /* CONFIG_FUNCTION_TRACER */ +#endif /* CONFIG_PERF_EVENTS */ + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..47db7eda0531 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -55,7 +55,7 @@ /* * Function trace entry - function address and parent function address: */ -FTRACE_ENTRY(function, ftrace_entry, +FTRACE_ENTRY_REG(function, ftrace_entry, TRACE_FN, @@ -64,7 +64,9 @@ FTRACE_ENTRY(function, ftrace_entry, __field( unsigned long, parent_ip ) ), - F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + + perf_ftrace_event_register ); /* Function call entry */ diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index d72af0b03822..fdeeb5c49627 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -24,6 +24,11 @@ static int total_ref_count; static int perf_trace_event_perm(struct ftrace_event_call *tp_event, struct perf_event *p_event) { + /* The ftrace function trace is allowed only for root. */ + if (ftrace_event_is_function(tp_event) && + perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* No tracing, just counting, so no obvious leak */ if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) return 0; @@ -250,3 +255,84 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, return raw_data; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); + +#ifdef CONFIG_FUNCTION_TRACER +static void +perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_entry *entry; + struct hlist_head *head; + struct pt_regs regs; + int rctx; + +#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \ + sizeof(u64)) - sizeof(u32)) + + BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE); + + perf_fetch_caller_regs(®s); + + entry = perf_trace_buf_prepare(ENTRY_SIZE, TRACE_FN, NULL, &rctx); + if (!entry) + return; + + entry->ip = ip; + entry->parent_ip = parent_ip; + + head = this_cpu_ptr(event_function.perf_events); + perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, 0, + 1, ®s, head); + +#undef ENTRY_SIZE +} + +static int perf_ftrace_function_register(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + + ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->func = perf_ftrace_function_call; + return register_ftrace_function(ops); +} + +static int perf_ftrace_function_unregister(struct perf_event *event) +{ + struct ftrace_ops *ops = &event->ftrace_ops; + return unregister_ftrace_function(ops); +} + +static void perf_ftrace_function_enable(struct perf_event *event) +{ + ftrace_function_local_enable(&event->ftrace_ops); +} + +static void perf_ftrace_function_disable(struct perf_event *event) +{ + ftrace_function_local_disable(&event->ftrace_ops); +} + +int perf_ftrace_event_register(struct ftrace_event_call *call, + enum trace_reg type, void *data) +{ + switch (type) { + case TRACE_REG_REGISTER: + case TRACE_REG_UNREGISTER: + break; + case TRACE_REG_PERF_REGISTER: + case TRACE_REG_PERF_UNREGISTER: + return 0; + case TRACE_REG_PERF_OPEN: + return perf_ftrace_function_register(data); + case TRACE_REG_PERF_CLOSE: + return perf_ftrace_function_unregister(data); + case TRACE_REG_PERF_ADD: + perf_ftrace_function_enable(data); + return 0; + case TRACE_REG_PERF_DEL: + perf_ftrace_function_disable(data); + return 0; + } + + return -EINVAL; +} +#endif /* CONFIG_FUNCTION_TRACER */ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index f74de861cde9..a3dbee6d04cd 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -184,4 +184,9 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; FTRACE_ENTRY_REG(call, struct_name, etype, \ PARAMS(tstruct), PARAMS(print), NULL) +int ftrace_event_is_function(struct ftrace_event_call *call) +{ + return call == &event_function; +} + #include "trace_entries.h" -- cgit v1.2.3 From 02aa3162edaa166a01d193f80ccde890be8b55da Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:53 +0100 Subject: ftrace: Allow to specify filter field type for ftrace events Adding FILTER_TRACE_FN event field type for function tracepoint event, so it can be properly recognized within filtering code. Currently all fields of ftrace subsystem events share the common field type FILTER_OTHER. Since the function trace fields need special care within the filtering code we need to recognize it properly, hence adding the FILTER_TRACE_FN event type. Adding filter parameter to the FTRACE_ENTRY macro, to specify the filter field type for the event. Link: http://lkml.kernel.org/r/1329317514-8131-7-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.h | 23 +++++++++-------- kernel/trace/trace_entries.h | 48 +++++++++++++++++++++++++---------- kernel/trace/trace_events_filter.c | 7 +++++- kernel/trace/trace_export.c | 51 +++++++++++++++++++++----------------- 5 files changed, 83 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2bf677cb2d6f..dd478fc8f9f5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -245,6 +245,7 @@ enum { FILTER_STATIC_STRING, FILTER_DYN_STRING, FILTER_PTR_STRING, + FILTER_TRACE_FN, }; #define EVENT_STORAGE_SIZE 128 diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 76a1c5094bbf..29f93cd434a5 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -56,21 +56,23 @@ enum trace_type { #define F_STRUCT(args...) args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ - struct struct_name { \ - struct trace_entry ent; \ - tstruct \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ } #undef TP_ARGS #define TP_ARGS(args...) args #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -826,12 +828,13 @@ extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ extern struct ftrace_event_call \ __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 47db7eda0531..d91eb0541b3a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -66,6 +66,8 @@ FTRACE_ENTRY_REG(function, ftrace_entry, F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip), + FILTER_TRACE_FN, + perf_ftrace_event_register ); @@ -80,7 +82,9 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, __field_desc( int, graph_ent, depth ) ), - F_printk("--> %lx (%d)", __entry->func, __entry->depth) + F_printk("--> %lx (%d)", __entry->func, __entry->depth), + + FILTER_OTHER ); /* Function return entry */ @@ -100,7 +104,9 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", __entry->func, __entry->depth, __entry->calltime, __entry->rettime, - __entry->depth) + __entry->depth), + + FILTER_OTHER ); /* @@ -129,8 +135,9 @@ FTRACE_ENTRY(context_switch, ctx_switch_entry, F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -148,8 +155,9 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", __entry->prev_pid, __entry->prev_prio, __entry->prev_state, __entry->next_pid, __entry->next_prio, __entry->next_state, - __entry->next_cpu - ) + __entry->next_cpu), + + FILTER_OTHER ); /* @@ -171,7 +179,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); FTRACE_ENTRY(user_stack, userstack_entry, @@ -187,7 +197,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], - __entry->caller[6], __entry->caller[7]) + __entry->caller[6], __entry->caller[7]), + + FILTER_OTHER ); /* @@ -204,7 +216,9 @@ FTRACE_ENTRY(bprint, bprint_entry, ), F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt) + __entry->ip, __entry->fmt), + + FILTER_OTHER ); FTRACE_ENTRY(print, print_entry, @@ -217,7 +231,9 @@ FTRACE_ENTRY(print, print_entry, ), F_printk("%08lx %s", - __entry->ip, __entry->buf) + __entry->ip, __entry->buf), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, @@ -236,7 +252,9 @@ FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, F_printk("%lx %lx %lx %d %x %x", (unsigned long)__entry->phys, __entry->value, __entry->pc, - __entry->map_id, __entry->opcode, __entry->width) + __entry->map_id, __entry->opcode, __entry->width), + + FILTER_OTHER ); FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, @@ -254,7 +272,9 @@ FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, F_printk("%lx %lx %lx %d %x", (unsigned long)__entry->phys, __entry->virt, __entry->len, - __entry->map_id, __entry->opcode) + __entry->map_id, __entry->opcode), + + FILTER_OTHER ); @@ -274,6 +294,8 @@ FTRACE_ENTRY(branch, trace_branch, F_printk("%u:%s:%s (%u)", __entry->line, - __entry->func, __entry->file, __entry->correct) + __entry->func, __entry->file, __entry->correct), + + FILTER_OTHER ); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 76afaee99dbc..3da3d0ec3584 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -899,6 +899,11 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } +static bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + static bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || @@ -986,7 +991,7 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; - } else { + } else if (!is_function_field(field)) { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); else diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index a3dbee6d04cd..7b46c9bd22ae 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -23,8 +23,10 @@ * function and thus become accesible via perf. */ #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, \ + filter, regfn) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) /* not needed for this file */ #undef __field_struct @@ -52,21 +54,22 @@ #define F_printk(fmt, args...) fmt, args #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -struct ____ftrace_##name { \ - tstruct \ -}; \ -static void __always_unused ____ftrace_check_##name(void) \ -{ \ - struct ____ftrace_##name *__entry = NULL; \ - \ - /* force compile-time check on F_printk() */ \ - printk(print); \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ } #undef FTRACE_ENTRY_DUP -#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ - FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print, filter) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print), \ + filter) #include "trace_entries.h" @@ -75,7 +78,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -85,7 +88,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -99,7 +102,7 @@ static void __always_unused ____ftrace_check_##name(void) \ ret = trace_define_field(event_call, event_storage, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ @@ -112,7 +115,7 @@ static void __always_unused ____ftrace_check_##name(void) \ offsetof(typeof(field), \ container.item), \ sizeof(field.container.item), \ - is_signed_type(type), FILTER_OTHER); \ + is_signed_type(type), filter_type); \ if (ret) \ return ret; @@ -120,17 +123,18 @@ static void __always_unused ____ftrace_check_##name(void) \ #define __dynamic_array(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - 0, is_signed_type(type), FILTER_OTHER);\ + 0, is_signed_type(type), filter_type);\ if (ret) \ return ret; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ int \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ int ret; \ + int filter_type = filter; \ \ tstruct; \ \ @@ -161,7 +165,8 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define F_printk(fmt, args...) #fmt ", " __stringify(args) #undef FTRACE_ENTRY_REG -#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, regfn)\ +#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ + regfn) \ \ struct ftrace_event_class event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ @@ -180,9 +185,9 @@ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print, filter) \ FTRACE_ENTRY_REG(call, struct_name, etype, \ - PARAMS(tstruct), PARAMS(print), NULL) + PARAMS(tstruct), PARAMS(print), filter, NULL) int ftrace_event_is_function(struct ftrace_event_call *call) { -- cgit v1.2.3 From 5500fa51199aee770ce53718853732600543619e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 15 Feb 2012 15:51:54 +0100 Subject: ftrace, perf: Add filter support for function trace event Adding support to filter function trace event via perf interface. It is now possible to use filter interface in the perf tool like: perf record -e ftrace:function --filter="(ip == mm_*)" ls The filter syntax is restricted to the the 'ip' field only, and following operators are accepted '==' '!=' '||', ending up with the filter strings like: ip == f1[, ]f2 ... || ip != f3[, ]f4 ... with comma ',' or space ' ' as a function separator. If the space ' ' is used as a separator, the right side of the assignment needs to be enclosed in double quotes '"', e.g.: perf record -e ftrace:function --filter '(ip == do_execve,sys_*,ext*)' ls perf record -e ftrace:function --filter '(ip == "do_execve,sys_*,ext*")' ls perf record -e ftrace:function --filter '(ip == "do_execve sys_* ext*")' ls The '==' operator adds trace filter with same effect as would be added via set_ftrace_filter file. The '!=' operator adds trace filter with same effect as would be added via set_ftrace_notrace file. The right side of the '!=', '==' operators is list of functions or regexp. to be added to filter separated by space. The '||' operator is used for connecting multiple filter definitions together. It is possible to have more than one '==' and '!=' operators within one filter string. Link: http://lkml.kernel.org/r/1329317514-8131-8-git-send-email-jolsa@redhat.com Signed-off-by: Jiri Olsa Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 6 ++ kernel/trace/trace.h | 2 - kernel/trace/trace_event_perf.c | 4 +- kernel/trace/trace_events_filter.c | 165 +++++++++++++++++++++++++++++++++++-- 5 files changed, 172 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 64a309d2fbd6..72a6cabb4d5b 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -250,6 +250,7 @@ int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); void ftrace_set_global_filter(unsigned char *buf, int len, int reset); void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); +void ftrace_free_filter(struct ftrace_ops *ops); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); @@ -380,9 +381,6 @@ extern void ftrace_enable_daemon(void); #else static inline int skip_trace(unsigned long ip) { return 0; } static inline int ftrace_force_update(void) { return 0; } -static inline void ftrace_set_filter(unsigned char *buf, int len, int reset) -{ -} static inline void ftrace_disable_daemon(void) { } static inline void ftrace_enable_daemon(void) { } static inline void ftrace_release_mod(struct module *mod) {} @@ -406,6 +404,9 @@ static inline int ftrace_text_reserved(void *start, void *end) */ #define ftrace_regex_open(ops, flag, inod, file) ({ -ENODEV; }) #define ftrace_set_early_filter(ops, buf, enable) do { } while (0) +#define ftrace_set_filter(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_set_notrace(ops, buf, len, reset) ({ -ENODEV; }) +#define ftrace_free_filter(ops) do { } while (0) static inline ssize_t ftrace_filter_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { return -ENODEV; } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f615f974d90e..867bd1dd2dd0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1186,6 +1186,12 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash) call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); } +void ftrace_free_filter(struct ftrace_ops *ops) +{ + free_ftrace_hash(ops->filter_hash); + free_ftrace_hash(ops->notrace_hash); +} + static struct ftrace_hash *alloc_ftrace_hash(int size_bits) { struct ftrace_hash *hash; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 29f93cd434a5..54faec790bc1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -776,9 +776,7 @@ struct filter_pred { u64 val; struct regex regex; unsigned short *ops; -#ifdef CONFIG_FTRACE_STARTUP_TEST struct ftrace_event_field *field; -#endif int offset; int not; int op; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fdeeb5c49627..fee3752ae8f6 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -298,7 +298,9 @@ static int perf_ftrace_function_register(struct perf_event *event) static int perf_ftrace_function_unregister(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - return unregister_ftrace_function(ops); + int ret = unregister_ftrace_function(ops); + ftrace_free_filter(ops); + return ret; } static void perf_ftrace_function_enable(struct perf_event *event) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 3da3d0ec3584..431dba8b7542 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -81,6 +81,7 @@ enum { FILT_ERR_TOO_MANY_PREDS, FILT_ERR_MISSING_FIELD, FILT_ERR_INVALID_FILTER, + FILT_ERR_IP_FIELD_ONLY, }; static char *err_text[] = { @@ -96,6 +97,7 @@ static char *err_text[] = { "Too many terms in predicate expression", "Missing field name and/or value", "Meaningless filter expression", + "Only 'ip' field is supported for function trace", }; struct opstack_op { @@ -991,7 +993,12 @@ static int init_pred(struct filter_parse_state *ps, fn = filter_pred_strloc; else fn = filter_pred_pchar; - } else if (!is_function_field(field)) { + } else if (is_function_field(field)) { + if (strcmp(field->name, "ip")) { + parse_error(ps, FILT_ERR_IP_FIELD_ONLY, 0); + return -EINVAL; + } + } else { if (field->is_signed) ret = strict_strtoll(pred->regex.pattern, 0, &val); else @@ -1338,10 +1345,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, strcpy(pred.regex.pattern, operand2); pred.regex.len = strlen(pred.regex.pattern); - -#ifdef CONFIG_FTRACE_STARTUP_TEST pred.field = field; -#endif return init_pred(ps, field, &pred) ? NULL : &pred; } @@ -1954,6 +1958,148 @@ void ftrace_profile_free_filter(struct perf_event *event) __free_filter(filter); } +struct function_filter_data { + struct ftrace_ops *ops; + int first_filter; + int first_notrace; +}; + +#ifdef CONFIG_FUNCTION_TRACER +static char ** +ftrace_function_filter_re(char *buf, int len, int *count) +{ + char *str, *sep, **re; + + str = kstrndup(buf, len, GFP_KERNEL); + if (!str) + return NULL; + + /* + * The argv_split function takes white space + * as a separator, so convert ',' into spaces. + */ + while ((sep = strchr(str, ','))) + *sep = ' '; + + re = argv_split(GFP_KERNEL, str, count); + kfree(str); + return re; +} + +static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, + int reset, char *re, int len) +{ + int ret; + + if (filter) + ret = ftrace_set_filter(ops, re, len, reset); + else + ret = ftrace_set_notrace(ops, re, len, reset); + + return ret; +} + +static int __ftrace_function_set_filter(int filter, char *buf, int len, + struct function_filter_data *data) +{ + int i, re_cnt, ret; + int *reset; + char **re; + + reset = filter ? &data->first_filter : &data->first_notrace; + + /* + * The 'ip' field could have multiple filters set, separated + * either by space or comma. We first cut the filter and apply + * all pieces separatelly. + */ + re = ftrace_function_filter_re(buf, len, &re_cnt); + if (!re) + return -EINVAL; + + for (i = 0; i < re_cnt; i++) { + ret = ftrace_function_set_regexp(data->ops, filter, *reset, + re[i], strlen(re[i])); + if (ret) + break; + + if (*reset) + *reset = 0; + } + + argv_free(re); + return ret; +} + +static int ftrace_function_check_pred(struct filter_pred *pred, int leaf) +{ + struct ftrace_event_field *field = pred->field; + + if (leaf) { + /* + * Check the leaf predicate for function trace, verify: + * - only '==' and '!=' is used + * - the 'ip' field is used + */ + if ((pred->op != OP_EQ) && (pred->op != OP_NE)) + return -EINVAL; + + if (strcmp(field->name, "ip")) + return -EINVAL; + } else { + /* + * Check the non leaf predicate for function trace, verify: + * - only '||' is used + */ + if (pred->op != OP_OR) + return -EINVAL; + } + + return 0; +} + +static int ftrace_function_set_filter_cb(enum move_type move, + struct filter_pred *pred, + int *err, void *data) +{ + /* Checking the node is valid for function trace. */ + if ((move != MOVE_DOWN) || + (pred->left != FILTER_PRED_INVALID)) { + *err = ftrace_function_check_pred(pred, 0); + } else { + *err = ftrace_function_check_pred(pred, 1); + if (*err) + return WALK_PRED_ABORT; + + *err = __ftrace_function_set_filter(pred->op == OP_EQ, + pred->regex.pattern, + pred->regex.len, + data); + } + + return (*err) ? WALK_PRED_ABORT : WALK_PRED_DEFAULT; +} + +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + struct function_filter_data data = { + .first_filter = 1, + .first_notrace = 1, + .ops = &event->ftrace_ops, + }; + + return walk_pred_tree(filter->preds, filter->root, + ftrace_function_set_filter_cb, &data); +} +#else +static int ftrace_function_set_filter(struct perf_event *event, + struct event_filter *filter) +{ + return -ENODEV; +} +#endif /* CONFIG_FUNCTION_TRACER */ + int ftrace_profile_set_filter(struct perf_event *event, int event_id, char *filter_str) { @@ -1974,9 +2120,16 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, goto out_unlock; err = create_filter(call, filter_str, false, &filter); - if (!err) - event->filter = filter; + if (err) + goto free_filter; + + if (ftrace_event_is_function(call)) + err = ftrace_function_set_filter(event, filter); else + event->filter = filter; + +free_filter: + if (err || ftrace_event_is_function(call)) __free_filter(filter); out_unlock: -- cgit v1.2.3 From fe15d706cfc1cb321dbe2329b04b5ca185edff60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 4 Jan 2012 13:30:33 -0800 Subject: rcu: Add lockdep-RCU checks for simple self-deadlock It is illegal to have a grace period within a same-flavor RCU read-side critical section, so this commit adds lockdep-RCU checks to splat when such abuse is encountered. This commit does not detect more elaborate RCU deadlock situations. These situations might be a job for lockdep enhancements. Signed-off-by: Paul E. McKenney --- kernel/rcutiny.c | 4 ++++ kernel/rcutiny_plugin.h | 5 +++++ kernel/rcutree.c | 8 ++++++++ kernel/rcutree_plugin.h | 4 ++++ kernel/srcu.c | 6 ++++++ 5 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 977296dca0a4..8e00d461911e 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -319,6 +319,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ void synchronize_sched(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 9cb1ae4aabdd..4b905404a5bd 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -706,6 +706,11 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); + #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!rcu_scheduler_active) return; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 6c4a6722abfd..3cf713a724c1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1816,6 +1816,10 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ void synchronize_sched(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; wait_rcu_gp(call_rcu_sched); @@ -1833,6 +1837,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; wait_rcu_gp(call_rcu_bh); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8bb35d73e1f9..3680b6b35bf3 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -688,6 +688,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; wait_rcu_gp(call_rcu); diff --git a/kernel/srcu.c b/kernel/srcu.c index 0febf61e1aa3..3f99fa0e8ed3 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -172,6 +172,12 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) { int idx; + rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && + !lock_is_held(&rcu_bh_lock_map) && + !lock_is_held(&rcu_lock_map) && + !lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + idx = sp->completed; mutex_lock(&sp->mutex); -- cgit v1.2.3 From 0bb7b59d6e2b8440cd7097097dd4bbfc4d76ed07 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Jan 2012 14:44:39 -0800 Subject: rcu: Add diagnostic for misaligned rcu_head structures The push for energy efficiency will require that RCU tag rcu_head structures to indicate whether or not their invocation is time critical. This tagging is best carried out in the bottom bits of the ->next pointers in the rcu_head structures. This tagging requires that the rcu_head structures be properly aligned, so this commit adds the required diagnostics. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 3cf713a724c1..570f7530f4b3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1707,6 +1707,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long flags; struct rcu_data *rdp; + WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ debug_rcu_head_queue(head); head->func = func; head->next = NULL; -- cgit v1.2.3 From 486e259340fc4c60474f2c14703e3b3634bb58ca Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Jan 2012 14:11:30 -0800 Subject: rcu: Avoid waking up CPUs having only kfree_rcu() callbacks When CONFIG_RCU_FAST_NO_HZ is enabled, RCU will allow a given CPU to enter dyntick-idle mode even if it still has RCU callbacks queued. RCU avoids system hangs in this case by scheduling a timer for several jiffies in the future. However, if all of the callbacks on that CPU are from kfree_rcu(), there is no reason to wake the CPU up, as it is not a problem to defer freeing of memory. This commit therefore tracks the number of callbacks on a given CPU that are from kfree_rcu(), and avoids scheduling the timer if all of a given CPU's callbacks are from kfree_rcu(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 +- include/linux/rcutiny.h | 6 ++++ include/linux/rcutree.h | 2 ++ include/trace/events/rcu.h | 63 ++++++++++++++++++++++-------------- kernel/rcu.h | 4 ++- kernel/rcutiny.c | 4 +-- kernel/rcutree.c | 29 ++++++++++------- kernel/rcutree.h | 3 +- kernel/rcutree_plugin.h | 79 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/rcutree_trace.c | 8 ++--- 10 files changed, 153 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 81c04f4348ec..a67d5f1072ea 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -841,7 +841,7 @@ void __kfree_rcu(struct rcu_head *head, unsigned long offset) /* See the kfree_rcu() header comment. */ BUILD_BUG_ON(!__is_kfree_rcu_offset(offset)); - call_rcu(head, (rcu_callback)offset); + kfree_call_rcu(head, (rcu_callback)offset); } /** diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 00b7a5e493d2..51bf29c81485 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -83,6 +83,12 @@ static inline void synchronize_sched_expedited(void) synchronize_sched(); } +static inline void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + call_rcu(head, func); +} + #ifdef CONFIG_TINY_RCU static inline void rcu_preempt_note_context_switch(void) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 73e7195f9997..73892483fd05 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -61,6 +61,8 @@ extern void synchronize_rcu_bh(void); extern void synchronize_sched_expedited(void); extern void synchronize_rcu_expedited(void); +void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); + static inline void synchronize_rcu_bh_expedited(void) { synchronize_sched_expedited(); diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index d2d88bed891b..337099783f37 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -313,19 +313,22 @@ TRACE_EVENT(rcu_prep_idle, /* * Tracepoint for the registration of a single RCU callback function. * The first argument is the type of RCU, the second argument is - * a pointer to the RCU callback itself, and the third element is the - * new RCU callback queue length for the current CPU. + * a pointer to the RCU callback itself, the third element is the + * number of lazy callbacks queued, and the fourth element is the + * total number of callbacks queued. */ TRACE_EVENT(rcu_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen), + TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen_lazy, + long qlen), - TP_ARGS(rcuname, rhp, qlen), + TP_ARGS(rcuname, rhp, qlen_lazy, qlen), TP_STRUCT__entry( __field(char *, rcuname) __field(void *, rhp) __field(void *, func) + __field(long, qlen_lazy) __field(long, qlen) ), @@ -333,11 +336,13 @@ TRACE_EVENT(rcu_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->func = rhp->func; + __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%pf %ld", - __entry->rcuname, __entry->rhp, __entry->func, __entry->qlen) + TP_printk("%s rhp=%p func=%pf %ld/%ld", + __entry->rcuname, __entry->rhp, __entry->func, + __entry->qlen_lazy, __entry->qlen) ); /* @@ -345,20 +350,21 @@ TRACE_EVENT(rcu_callback, * kfree() form. The first argument is the RCU type, the second argument * is a pointer to the RCU callback, the third argument is the offset * of the callback within the enclosing RCU-protected data structure, - * and the fourth argument is the new RCU callback queue length for the - * current CPU. + * the fourth argument is the number of lazy callbacks queued, and the + * fifth argument is the total number of callbacks queued. */ TRACE_EVENT(rcu_kfree_callback, TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen), + long qlen_lazy, long qlen), - TP_ARGS(rcuname, rhp, offset, qlen), + TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen), TP_STRUCT__entry( __field(char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) + __field(long, qlen_lazy) __field(long, qlen) ), @@ -366,41 +372,45 @@ TRACE_EVENT(rcu_kfree_callback, __entry->rcuname = rcuname; __entry->rhp = rhp; __entry->offset = offset; + __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; ), - TP_printk("%s rhp=%p func=%ld %ld", + TP_printk("%s rhp=%p func=%ld %ld/%ld", __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen) + __entry->qlen_lazy, __entry->qlen) ); /* * Tracepoint for marking the beginning rcu_do_batch, performed to start * RCU callback invocation. The first argument is the RCU flavor, - * the second is the total number of callbacks (including those that - * are not yet ready to be invoked), and the third argument is the - * current RCU-callback batch limit. + * the second is the number of lazy callbacks queued, the third is + * the total number of callbacks queued, and the fourth argument is + * the current RCU-callback batch limit. */ TRACE_EVENT(rcu_batch_start, - TP_PROTO(char *rcuname, long qlen, int blimit), + TP_PROTO(char *rcuname, long qlen_lazy, long qlen, int blimit), - TP_ARGS(rcuname, qlen, blimit), + TP_ARGS(rcuname, qlen_lazy, qlen, blimit), TP_STRUCT__entry( __field(char *, rcuname) + __field(long, qlen_lazy) __field(long, qlen) __field(int, blimit) ), TP_fast_assign( __entry->rcuname = rcuname; + __entry->qlen_lazy = qlen_lazy; __entry->qlen = qlen; __entry->blimit = blimit; ), - TP_printk("%s CBs=%ld bl=%d", - __entry->rcuname, __entry->qlen, __entry->blimit) + TP_printk("%s CBs=%ld/%ld bl=%d", + __entry->rcuname, __entry->qlen_lazy, __entry->qlen, + __entry->blimit) ); /* @@ -531,16 +541,21 @@ TRACE_EVENT(rcu_torture_read, #else /* #ifdef CONFIG_RCU_TRACE */ #define trace_rcu_grace_period(rcuname, gpnum, gpevent) do { } while (0) -#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, qsmask) do { } while (0) +#define trace_rcu_grace_period_init(rcuname, gpnum, level, grplo, grphi, \ + qsmask) do { } while (0) #define trace_rcu_preempt_task(rcuname, pid, gpnum) do { } while (0) #define trace_rcu_unlock_preempted_task(rcuname, gpnum, pid) do { } while (0) -#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks) do { } while (0) +#define trace_rcu_quiescent_state_report(rcuname, gpnum, mask, qsmask, level, \ + grplo, grphi, gp_tasks) do { } \ + while (0) #define trace_rcu_fqs(rcuname, gpnum, cpu, qsevent) do { } while (0) #define trace_rcu_dyntick(polarity, oldnesting, newnesting) do { } while (0) #define trace_rcu_prep_idle(reason) do { } while (0) -#define trace_rcu_callback(rcuname, rhp, qlen) do { } while (0) -#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen) do { } while (0) -#define trace_rcu_batch_start(rcuname, qlen, blimit) do { } while (0) +#define trace_rcu_callback(rcuname, rhp, qlen_lazy, qlen) do { } while (0) +#define trace_rcu_kfree_callback(rcuname, rhp, offset, qlen_lazy, qlen) \ + do { } while (0) +#define trace_rcu_batch_start(rcuname, qlen_lazy, qlen, blimit) \ + do { } while (0) #define trace_rcu_invoke_callback(rcuname, rhp) do { } while (0) #define trace_rcu_invoke_kfree_callback(rcuname, rhp, offset) do { } while (0) #define trace_rcu_batch_end(rcuname, callbacks_invoked, cb, nr, iit, risk) \ diff --git a/kernel/rcu.h b/kernel/rcu.h index aa88baab5f78..a074b0b43fc2 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -76,16 +76,18 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) extern void kfree(const void *); -static inline void __rcu_reclaim(char *rn, struct rcu_head *head) +static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; if (__is_kfree_rcu_offset(offset)) { RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); kfree((void *)head - offset); + return 1; } else { RCU_TRACE(trace_rcu_invoke_callback(rn, head)); head->func(head); + return 0; } } diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 8e00d461911e..4eb34fcc2a75 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -258,7 +258,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* If no RCU callbacks ready to invoke, just return. */ if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, ACCESS_ONCE(rcp->rcucblist), need_resched(), @@ -269,7 +269,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 570f7530f4b3..acf2d67ad2f4 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1261,6 +1261,7 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen_lazy += rdp->qlen_lazy; receive_rdp->qlen += rdp->qlen; receive_rdp->n_cbs_adopted += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; @@ -1268,6 +1269,7 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; rdp->qlen = 0; } @@ -1368,11 +1370,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int bl, count; + int bl, count, count_lazy; /* If no callbacks are ready, just return.*/ if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, 0, 0); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -1385,7 +1387,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ local_irq_save(flags); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen, bl); + trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); list = rdp->nxtlist; rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1396,12 +1398,13 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); /* Invoke callbacks. */ - count = 0; + count = count_lazy = 0; while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - __rcu_reclaim(rsp->name, list); + if (__rcu_reclaim(rsp->name, list)) + count_lazy++; list = next; /* Stop only if limit reached and CPU has something to do. */ if (++count >= bl && @@ -1416,6 +1419,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rcu_is_callbacks_kthread()); /* Update count, and requeue any remaining callbacks. */ + rdp->qlen_lazy -= count_lazy; rdp->qlen -= count; rdp->n_cbs_invoked += count; if (list != NULL) { @@ -1702,7 +1706,7 @@ static void invoke_rcu_core(void) static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), - struct rcu_state *rsp) + struct rcu_state *rsp, bool lazy) { unsigned long flags; struct rcu_data *rdp; @@ -1727,12 +1731,14 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), *rdp->nxttail[RCU_NEXT_TAIL] = head; rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; + if (lazy) + rdp->qlen_lazy++; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen); + rdp->qlen_lazy, rdp->qlen); else - trace_rcu_callback(rsp->name, head, rdp->qlen); + trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { @@ -1779,16 +1785,16 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), */ void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_sched_state); + __call_rcu(head, func, &rcu_sched_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_sched); /* - * Queue an RCU for invocation after a quicker grace period. + * Queue an RCU callback for invocation after a quicker grace period. */ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_bh_state); + __call_rcu(head, func, &rcu_bh_state, 0); } EXPORT_SYMBOL_GPL(call_rcu_bh); @@ -2036,6 +2042,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index fddff92d6676..af2af3cc5e65 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -265,7 +265,8 @@ struct rcu_data { */ struct rcu_head *nxtlist; struct rcu_head **nxttail[RCU_NEXT_SIZE]; - long qlen; /* # of queued callbacks */ + long qlen_lazy; /* # of lazy queued callbacks */ + long qlen; /* # of queued callbacks, incl lazy */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 3680b6b35bf3..7adf232bb66b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -671,10 +671,24 @@ static void rcu_preempt_do_callbacks(void) */ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) { - __call_rcu(head, func, &rcu_preempt_state); + __call_rcu(head, func, &rcu_preempt_state, 0); } EXPORT_SYMBOL_GPL(call_rcu); +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -1064,6 +1078,22 @@ static void rcu_preempt_process_callbacks(void) { } +/* + * Queue an RCU callback for lazy invocation after a grace period. + * This will likely be later named something like "call_rcu_lazy()", + * but this change will require some way of tagging the lazy RCU + * callbacks in the list of pending callbacks. Until then, this + * function may only be called from __kfree_rcu(). + * + * Because there is no preemptible RCU, we use RCU-sched instead. + */ +void kfree_call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_state, 1); +} +EXPORT_SYMBOL_GPL(kfree_call_rcu); + /* * Wait for an rcu-preempt grace period, but make it happen quickly. * But because preemptible RCU does not exist, map to rcu-sched. @@ -2051,6 +2081,48 @@ int rcu_needs_cpu(int cpu) return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies; } +/* + * Does the specified flavor of RCU have non-lazy callbacks pending on + * the specified CPU? Both RCU flavor and CPU are specified by the + * rcu_data structure. + */ +static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) +{ + return rdp->qlen != rdp->qlen_lazy; +} + +#ifdef CONFIG_TREE_PREEMPT_RCU + +/* + * Are there non-lazy RCU-preempt callbacks? (There cannot be if there + * is no RCU-preempt in the kernel.) + */ +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + + return __rcu_cpu_has_nonlazy_callbacks(rdp); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +{ + return 0; +} + +#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +/* + * Does any flavor of RCU have non-lazy callbacks on the specified CPU? + */ +static bool rcu_cpu_has_nonlazy_callbacks(int cpu) +{ + return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || + __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || + rcu_preempt_cpu_has_nonlazy_callbacks(cpu); +} + /* * Timer handler used to force CPU to start pushing its remaining RCU * callbacks in the case where it entered dyntick-idle mode with callbacks @@ -2149,8 +2221,9 @@ static void rcu_prepare_for_idle(int cpu) trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; - hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), - rcu_idle_gp_wait, HRTIMER_MODE_REL); + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_gp_wait, HRTIMER_MODE_REL); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 654cfe67f0d1..db0987c1e1bd 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -73,8 +73,8 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, " ql=%ld qs=%c%c%c%c", - rdp->qlen, + seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", + rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -145,7 +145,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); - seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, + seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], ".R"[rdp->nxttail[RCU_WAIT_TAIL] != @@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); + seq_puts(m, "\"of\",\"ri\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.3 From e5601400081651060a59bd1f45f2821bb8e97f95 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 7 Jan 2012 11:03:57 -0800 Subject: rcu: Simplify offline processing Move ->qsmaskinit and blkd_tasks[] manipulation to the CPU_DYING notifier. This simplifies the code by eliminating a potential deadlock and by reducing the responsibilities of force_quiescent_state(). Also rename functions to make their connection to the CPU-hotplug stages explicit. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 160 +++++++++++++++++++++++------------------------- kernel/rcutree.h | 4 +- kernel/rcutree_plugin.h | 25 ++++---- 3 files changed, 90 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index acf2d67ad2f4..575f91d03f06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -943,6 +943,10 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat * in preparation for detecting the next grace period. The caller must hold * the root node's ->lock, which is released before return. Hard irqs must * be disabled. + * + * Note that it is legal for a dying CPU (which is marked as offline) to + * invoke this function. This can happen when the dying CPU reports its + * quiescent state. */ static void rcu_start_gp(struct rcu_state *rsp, unsigned long flags) @@ -1245,118 +1249,101 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) /* * Move a dying CPU's RCU callbacks to online CPU's callback list. - * Synchronization is not required because this function executes - * in stop_machine() context. + * Also record a quiescent state for this CPU for the current grace period. + * Synchronization and interrupt disabling are not required because + * this function executes in stop_machine() context. Therefore, cleanup + * operations that might block must be done later from the CPU_DEAD + * notifier. + * + * Note that the outgoing CPU's bit has already been cleared in the + * cpu_online_mask. This allows us to randomly pick a callback + * destination from the bits set in that mask. */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { + unsigned long flags; int i; - /* current DYING CPU is cleared in the cpu_online_mask */ + unsigned long mask; + int need_report; int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); + struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ + + /* Move callbacks to some other CPU. */ + if (rdp->nxtlist != NULL) { + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = + rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen_lazy += rdp->qlen_lazy; + receive_rdp->qlen += rdp->qlen; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen_lazy = 0; + rdp->qlen = 0; + } - if (rdp->nxtlist == NULL) - return; /* irqs disabled, so comparison is stable. */ - - *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->qlen_lazy += rdp->qlen_lazy; - receive_rdp->qlen += rdp->qlen; - receive_rdp->n_cbs_adopted += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen_lazy = 0; - rdp->qlen = 0; -} - -/* - * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy - * and move all callbacks from the outgoing CPU to the current one. - * There can only be one CPU hotplug operation at a time, so no other - * CPU can be attempting to update rcu_cpu_kthread_task. - */ -static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - int need_report = 0; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp; - - rcu_stop_cpu_kthread(cpu); - - /* Exclude any attempts to start a new grace period. */ - raw_spin_lock_irqsave(&rsp->onofflock, flags); - - /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ - rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ + /* Record a quiescent state for the dying CPU. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - !!(rnp->qsmask & mask), + "cpuofl"); + rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); + /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ + + /* + * Remove the dying CPU from the bitmasks in the rcu_node + * hierarchy. Because we are in stop_machine() context, we + * automatically exclude ->onofflock critical sections. + */ do { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock_irqsave(&rnp->lock, flags); rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { - if (rnp != rdp->mynode) - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - else - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); + raw_spin_unlock_irqrestore(&rnp->lock, flags); break; } if (rnp == rdp->mynode) { - trace_rcu_grace_period(rsp->name, - rnp->gpnum + 1 - - !!(rnp->qsmask & mask), - "cpuofl"); need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp, true); } else - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); mask = rnp->grpmask; rnp = rnp->parent; } while (rnp != NULL); - - /* - * We still hold the leaf rcu_node structure lock here, and - * irqs are still disabled. The reason for this subterfuge is - * because invoking rcu_report_unblock_qs_rnp() with ->onofflock - * held leads to deadlock. - */ - raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ - rnp = rdp->mynode; - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - rcu_node_kthread_setaffinity(rnp, -1); } /* - * Remove the specified CPU from the RCU hierarchy and move any pending - * callbacks that it might have to the current CPU. This code assumes - * that at least one CPU in the system will remain running at all times. - * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. + * The CPU has been completely removed, and some other CPU is reporting + * this fact from process context. Do the remainder of the cleanup. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. */ -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { - __rcu_offline_cpu(cpu, &rcu_sched_state); - __rcu_offline_cpu(cpu, &rcu_bh_state); - rcu_preempt_offline_cpu(cpu); + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + rcu_stop_cpu_kthread(cpu); + rcu_node_kthread_setaffinity(rnp, -1); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ -static void rcu_send_cbs_to_online(struct rcu_state *rsp) +static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { } -static void rcu_offline_cpu(int cpu) +static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -1725,6 +1712,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * a quiescent state betweentimes. */ local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ @@ -2155,16 +2143,18 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, * touch any data without introducing corruption. We send the * dying CPU's callbacks to an arbitrarily chosen online CPU. */ - rcu_send_cbs_to_online(&rcu_bh_state); - rcu_send_cbs_to_online(&rcu_sched_state); - rcu_preempt_send_cbs_to_online(); + rcu_cleanup_dying_cpu(&rcu_bh_state); + rcu_cleanup_dying_cpu(&rcu_sched_state); + rcu_preempt_cleanup_dying_cpu(); rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: - rcu_offline_cpu(cpu); + rcu_cleanup_dead_cpu(cpu, &rcu_bh_state); + rcu_cleanup_dead_cpu(cpu, &rcu_sched_state); + rcu_preempt_cleanup_dead_cpu(cpu); break; default: break; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index af2af3cc5e65..05e03675439a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -439,8 +439,8 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); static int rcu_preempt_offline_tasks(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void rcu_preempt_offline_cpu(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_cleanup_dead_cpu(int cpu); static void rcu_preempt_check_callbacks(int cpu); static void rcu_preempt_process_callbacks(void); void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); @@ -451,7 +451,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, static int rcu_preempt_pending(int cpu); static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); -static void rcu_preempt_send_cbs_to_online(void); +static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7adf232bb66b..eeb2cc6b8657 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -618,16 +618,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, return retval; } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Do CPU-offline processing for preemptible RCU. */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu) { - __rcu_offline_cpu(cpu, &rcu_preempt_state); + rcu_cleanup_dead_cpu(cpu, &rcu_preempt_state); } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, @@ -912,11 +912,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Move preemptible RCU's callbacks from dying CPU to other online CPU. + * Move preemptible RCU's callbacks from dying CPU to other online CPU + * and record a quiescent state. */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void) { - rcu_send_cbs_to_online(&rcu_preempt_state); + rcu_cleanup_dying_cpu(&rcu_preempt_state); } /* @@ -1052,16 +1053,16 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, return 0; } +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + /* * Because preemptible RCU does not exist, it never needs CPU-offline * processing. */ -static void rcu_preempt_offline_cpu(int cpu) +static void rcu_preempt_cleanup_dead_cpu(int cpu) { } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, it never has any callbacks * to check. @@ -1153,9 +1154,9 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu) } /* - * Because there is no preemptible RCU, there are no callbacks to move. + * Because there is no preemptible RCU, there is no cleanup to do. */ -static void rcu_preempt_send_cbs_to_online(void) +static void rcu_preempt_cleanup_dying_cpu(void) { } -- cgit v1.2.3 From 091541bbdb9906349481a504e7d8e7fa89f6b6bb Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2012 12:51:14 -0800 Subject: rcu: Make rcutorture flag online/offline failures Make rcutorture check for CPU-hotplug failures and complain if there were any. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a58ac285fc69..a52b3217c160 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1602,6 +1602,10 @@ rcu_torture_cleanup(void) cur_ops->cleanup(); if (atomic_read(&n_rcu_torture_error)) rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); + else if (n_online_successes != n_online_attempts || + n_offline_successes != n_offline_attempts) + rcu_torture_print_module_parms(cur_ops, + "End of test: RCU_HOTPLUG"); else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } -- cgit v1.2.3 From 778d250a29224795e6320b58928bafa6b6104a06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2012 14:13:24 -0800 Subject: rcu: Limit lazy-callback duration Currently, a given CPU is permitted to remain in dyntick-idle mode indefinitely if it has only lazy RCU callbacks queued. This is vulnerable to corner cases in NUMA systems, so limit the time to six seconds by default. (Currently controlled by a cpp macro.) Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index eeb2cc6b8657..f8e6dcc91860 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2050,6 +2050,9 @@ static void rcu_prepare_for_idle(int cpu) * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your * system. And if you are -that- concerned about energy efficiency, * just power the system down and be done with it! + * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is + * permitted to sleep in dyntick-idle mode with only lazy RCU + * callbacks pending. Setting this too high can OOM your system. * * The values below work well in practice. If future workloads require * adjustment, they can be converted into kernel config parameters, though @@ -2058,11 +2061,13 @@ static void rcu_prepare_for_idle(int cpu) #define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ #define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */ #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ +#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ static DEFINE_PER_CPU(int, rcu_dyntick_drain); static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer); -static ktime_t rcu_idle_gp_wait; +static ktime_t rcu_idle_gp_wait; /* If some non-lazy callbacks. */ +static ktime_t rcu_idle_lazy_gp_wait; /* If only lazy callbacks. */ /* * Allow the CPU to enter dyntick-idle mode if either: (1) There are no @@ -2151,6 +2156,8 @@ static void rcu_prepare_for_idle_init(int cpu) unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY); rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000); + upj = jiffies_to_usecs(RCU_IDLE_LAZY_GP_DELAY); + rcu_idle_lazy_gp_wait = ns_to_ktime(upj * (u64)1000); firsttime = 0; } } @@ -2225,6 +2232,9 @@ static void rcu_prepare_for_idle(int cpu) if (rcu_cpu_has_nonlazy_callbacks(cpu)) hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), rcu_idle_gp_wait, HRTIMER_MODE_REL); + else + hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), + rcu_idle_lazy_gp_wait, HRTIMER_MODE_REL); return; /* Nothing more to do immediately. */ } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ -- cgit v1.2.3 From 8146c4e2e2c1972216afece5c50e072e86120e42 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2012 14:23:29 -0800 Subject: rcu: Check for callback invocation from offline CPUs Because quiescent states are now reported from offline CPUs in CPU_DYING state, there is some possibility that such a CPU might note the end of a grace period and attempt to start invoking callbacks. This would be a very bad thing, and is supposed to be prevented by the fact that the CPU_DYING CPU gets rid of all its callbacks before reporting the quiescent state. However, there is other CPU-offline code in the kernel, and it is quite possible that someone will invoke RCU core processing from that code. Therefore, this commit adds a warning for this case. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 575f91d03f06..ac3a810d2db7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1373,6 +1373,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) * races with call_rcu() from interrupt handlers. */ local_irq_save(flags); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); list = rdp->nxtlist; -- cgit v1.2.3 From a50c3af910e06f35bc0c68f89d8fef98c0fec0ea Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 10 Jan 2012 17:52:31 -0800 Subject: rcu: Don't make callbacks go through second full grace period RCU's current CPU-offline code path dumps all of the outgoing CPU's callbacks onto the RCU_NEXT_TAIL portion of the surviving CPU's callback list. This means that all the ready-to-invoke callbacks from the outgoing CPU must wait for another full RCU grace period. This was just fine when CPU-hotplug events were rare, but there is increasing evidence that users are planning to make increasing use of CPU hotplug. Therefore, this commit changes the callback-dumping procedure so that callbacks that are ready to invoke are moved to the RCU_DONE_TAIL portion of the surviving CPU's callback list. This avoids running these callbacks through a second unnecessary grace period. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 46 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ac3a810d2db7..7789e666394d 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1270,24 +1270,64 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ - /* Move callbacks to some other CPU. */ + /* First, adjust the counts. */ + if (rdp->nxtlist != NULL) { + receive_rdp->qlen_lazy += rdp->qlen_lazy; + receive_rdp->qlen += rdp->qlen; + rdp->qlen_lazy = 0; + rdp->qlen = 0; + } + + /* + * Next, move ready-to-invoke callbacks to be invoked on some + * other CPU. These will not be required to pass through another + * grace period: They are done, regardless of CPU. + */ + if (rdp->nxtlist != NULL && + rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) { + struct rcu_head *oldhead; + struct rcu_head **oldtail; + struct rcu_head **newtail; + + oldhead = rdp->nxtlist; + oldtail = receive_rdp->nxttail[RCU_DONE_TAIL]; + rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = *oldtail; + *receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead; + newtail = rdp->nxttail[RCU_DONE_TAIL]; + for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) { + if (receive_rdp->nxttail[i] == oldtail) + receive_rdp->nxttail[i] = newtail; + if (rdp->nxttail[i] == newtail) + rdp->nxttail[i] = &rdp->nxtlist; + } + } + + /* + * Finally, put the rest of the callbacks at the end of the list. + * The ones that made it partway through get to start over: We + * cannot assume that grace periods are synchronized across CPUs. + * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but + * this does not seem compelling. Not yet, anyway.) + */ if (rdp->nxtlist != NULL) { *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - receive_rdp->qlen_lazy += rdp->qlen_lazy; - receive_rdp->qlen += rdp->qlen; receive_rdp->n_cbs_adopted += rdp->qlen; rdp->n_cbs_orphaned += rdp->qlen; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; - rdp->qlen_lazy = 0; - rdp->qlen = 0; } - /* Record a quiescent state for the dying CPU. */ + /* + * Record a quiescent state for the dying CPU. This is safe + * only because we have already cleared out the callbacks. + * (Otherwise, the RCU core might try to schedule the invocation + * of callbacks on this now-offline CPU, which would be bad.) + */ mask = rdp->grpmask; /* rnp->grplo is constant. */ trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), -- cgit v1.2.3 From f38bd1020f797694b6b5e06f5f06c87688fc84c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jan 2012 11:34:50 -0800 Subject: rcu: Remove single-rcu_node optimization in rcu_start_gp() The grace-period initialization sequence in rcu_start_gp() has a special case for systems where the rcu_node tree is a single rcu_node structure. This made sense some years ago when systems were smaller and up to 64 CPUs could share a single rcu_node structure, but now that large systems are common and a given leaf rcu_node structure can support only 16 CPUs (due to lock contention on the rcu_node's ->lock field), this optimization is almost never taken. And even the small mobile platforms that might make use of it might rather have the kernel text reduction. Therefore, this commit removes the check for single-rcu_node trees. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7789e666394d..49bb363a0837 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -984,26 +984,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; record_gp_stall_check_time(rsp); - - /* Special-case the common single-level case. */ - if (NUM_RCU_NODES == 1) { - rcu_preempt_check_blocked_tasks(rnp); - rnp->qsmask = rnp->qsmaskinit; - rnp->gpnum = rsp->gpnum; - rnp->completed = rsp->completed; - rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */ - rcu_start_gp_per_cpu(rsp, rnp, rdp); - rcu_preempt_boost_start_gp(rnp); - trace_rcu_grace_period_init(rsp->name, rnp->gpnum, - rnp->level, rnp->grplo, - rnp->grphi, rnp->qsmask); - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ - /* Exclude any concurrent CPU-hotplug operations. */ raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ -- cgit v1.2.3 From 26861faf896a4cfdc4243281e5c305755f4bad52 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jan 2012 14:40:20 -0800 Subject: rcu: Protect __rcu_read_unlock() against scheduler-using irq handlers This commit ports commit #10f39bb1b2 (rcu: protect __rcu_read_unlock() against scheduler-using irq handlers) from TREE_PREEMPT_RCU to TINY_PREEMPT_RCU. The following is a corresponding port of that commit message. The addition of RCU read-side critical sections within runqueue and priority-inheritance critical sections introduced some deadlocks, for example, involving interrupts from __rcu_read_unlock() where the interrupt handlers call wake_up(). This situation can cause the instance of __rcu_read_unlock() invoked from interrupt to do some of the processing that would otherwise have been carried out by the task-level instance of __rcu_read_unlock(). When the interrupt-level instance of __rcu_read_unlock() is called with a scheduler lock held from interrupt-entry/exit situations where in_irq() returns false, deadlock can result. Of course, in a UP kernel, there are not really any deadlocks, but the upper-level critical section can still be be fatally confused by the lower-level critical section changing things out from under it. This commit resolves these deadlocks by using negative values of the per-task ->rcu_read_lock_nesting counter to indicate that an instance of __rcu_read_unlock() is in flight, which in turn prevents instances from interrupt handlers from doing any special processing. Note that nested rcu_read_lock()/rcu_read_unlock() pairs are still permitted, but they will never see ->rcu_read_lock_nesting go to zero, and will therefore never invoke rcu_read_unlock_special(), thus preventing them from seeing the RCU_READ_UNLOCK_BLOCKED bit should it be set in ->rcu_read_unlock_special. This patch also adds a check for ->rcu_read_unlock_special being negative in rcu_check_callbacks(), thus preventing the RCU_READ_UNLOCK_NEED_QS bit from being set should a scheduling-clock interrupt occur while __rcu_read_unlock() is exiting from an outermost RCU read-side critical section. Of course, __rcu_read_unlock() can be preempted during the time that ->rcu_read_lock_nesting is negative. This could result in the setting of the RCU_READ_UNLOCK_BLOCKED bit after __rcu_read_unlock() checks it, and would also result it this task being queued on the corresponding rcu_node structure's blkd_tasks list. Therefore, some later RCU read-side critical section would enter rcu_read_unlock_special() to clean up -- which could result in deadlock (OK, OK, fatal confusion) if that RCU read-side critical section happened to be in the scheduler where the runqueue or priority-inheritance locks were held. To prevent the possibility of fatal confusion that might result from preemption during the time that ->rcu_read_lock_nesting is negative, this commit also makes rcu_preempt_note_context_switch() check for negative ->rcu_read_lock_nesting, thus refraining from queuing the task (and from setting RCU_READ_UNLOCK_BLOCKED) if we are already exiting from the outermost RCU read-side critical section (in other words, we really are no longer actually in that RCU read-side critical section). In addition, rcu_preempt_note_context_switch() invokes rcu_read_unlock_special() to carry out the cleanup in this case, which clears out the ->rcu_read_unlock_special bits and dequeues the task (if necessary), in turn avoiding needless delay of the current RCU grace period and needless RCU priority boosting. It is still illegal to call rcu_read_unlock() while holding a scheduler lock if the prior RCU read-side critical section has ever had both preemption and irqs enabled. However, the common use case is legal, namely where then entire RCU read-side critical section executes with irqs disabled, for example, when the scheduler lock is held across the entire lifetime of the RCU read-side critical section. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 4b905404a5bd..432ed2bc05ad 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -132,6 +132,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { RCU_TRACE(.rcb.name = "rcu_preempt") }; +static void rcu_read_unlock_special(struct task_struct *t); static int rcu_preempted_readers_exp(void); static void rcu_report_exp_done(void); @@ -146,6 +147,16 @@ static int rcu_cpu_blocking_cur_gp(void) /* * Check for a running RCU reader. Because there is only one CPU, * there can be but one running RCU reader at a time. ;-) + * + * Returns zero if there are no running readers. Returns a positive + * number if there is at least one reader within its RCU read-side + * critical section. Returns a negative number if an outermost reader + * is in the midst of exiting from its RCU read-side critical section + * + * Returns zero if there are no running readers. Returns a positive + * number if there is at least one reader within its RCU read-side + * critical section. Returns a negative number if an outermost reader + * is in the midst of exiting from its RCU read-side critical section. */ static int rcu_preempt_running_reader(void) { @@ -475,7 +486,7 @@ void rcu_preempt_note_context_switch(void) unsigned long flags; local_irq_save(flags); /* must exclude scheduler_tick(). */ - if (rcu_preempt_running_reader() && + if (rcu_preempt_running_reader() > 0 && (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ @@ -494,6 +505,13 @@ void rcu_preempt_note_context_switch(void) list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); if (rcu_cpu_blocking_cur_gp()) rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } else if (rcu_preempt_running_reader() < 0 && + t->rcu_read_unlock_special) { + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); } /* @@ -618,13 +636,22 @@ void __rcu_read_unlock(void) struct task_struct *t = current; barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ - --t->rcu_read_lock_nesting; - barrier(); /* decrement before load of ->rcu_read_unlock_special */ - if (t->rcu_read_lock_nesting == 0 && - unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) - rcu_read_unlock_special(t); + if (t->rcu_read_lock_nesting != 1) + --t->rcu_read_lock_nesting; + else { + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } #ifdef CONFIG_PROVE_LOCKING - WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } #endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); @@ -649,7 +676,7 @@ static void rcu_preempt_check_callbacks(void) invoke_rcu_callbacks(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && - rcu_preempt_running_reader()) + rcu_preempt_running_reader() > 0) t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; } -- cgit v1.2.3 From afef20540f7cd1ea91bc1ac20be238389eee4003 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jan 2012 15:30:36 -0800 Subject: rcu: Streamline code produced by __rcu_read_unlock() This is a port of commit #be0e1e21 to TINY_PREEMPT_RCU. This uses noinline to prevent rcu_read_unlock_special() from being inlined into __rcu_read_unlock(). Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 432ed2bc05ad..b58a3200f0ff 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -544,7 +544,7 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock); * notify RCU core processing or task having blocked during the RCU * read-side critical section. */ -static void rcu_read_unlock_special(struct task_struct *t) +static noinline void rcu_read_unlock_special(struct task_struct *t) { int empty; int empty_exp; -- cgit v1.2.3 From 768dfffdffbfcc07d6927bdd642c714c0dd64c99 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jan 2012 16:33:17 -0800 Subject: rcu: Prevent RCU callbacks from executing before scheduler initialized This is a port of commit #b0d3041 from TREE_RCU to TREE_PREEMPT_RCU. Under some rare but real combinations of configuration parameters, RCU callbacks are posted during early boot that use kernel facilities that are not yet initialized. Therefore, when these callbacks are invoked, hard hangs and crashes ensue. This commit therefore prevents RCU callbacks from being invoked until after the scheduler is fully up and running, as in after multiple tasks have been spawned. It might well turn out that a better approach is to identify the specific RCU callbacks that are causing this problem, but that discussion will wait until such time as someone really needs an RCU callback to be invoked (as opposed to merely registered) during early boot. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 4 ---- kernel/rcutiny_plugin.h | 15 ++++++++++++--- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 51bf29c81485..e93df77176d1 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,13 +27,9 @@ #include -#ifdef CONFIG_RCU_BOOST static inline void rcu_init(void) { } -#else /* #ifdef CONFIG_RCU_BOOST */ -void rcu_init(void); -#endif /* #else #ifdef CONFIG_RCU_BOOST */ static inline void rcu_barrier_bh(void) { diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index b58a3200f0ff..95df60ebe363 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -914,7 +914,8 @@ static void rcu_preempt_process_callbacks(void) static void invoke_rcu_callbacks(void) { have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); + if (rcu_kthread_task != NULL) + wake_up(&rcu_kthread_wq); } #ifdef CONFIG_RCU_TRACE @@ -975,12 +976,16 @@ early_initcall(rcu_spawn_kthreads); #else /* #ifdef CONFIG_RCU_BOOST */ +/* Hold off callback invocation until early_initcall() time. */ +static int rcu_scheduler_fully_active __read_mostly; + /* * Start up softirq processing of callbacks. */ void invoke_rcu_callbacks(void) { - raise_softirq(RCU_SOFTIRQ); + if (rcu_scheduler_fully_active) + raise_softirq(RCU_SOFTIRQ); } #ifdef CONFIG_RCU_TRACE @@ -995,10 +1000,14 @@ static bool rcu_is_callbacks_kthread(void) #endif /* #ifdef CONFIG_RCU_TRACE */ -void rcu_init(void) +static int __init rcu_scheduler_really_started(void) { + rcu_scheduler_fully_active = 1; open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + raise_softirq(RCU_SOFTIRQ); /* Invoke any callbacks from early boot. */ + return 0; } +early_initcall(rcu_scheduler_really_started); #endif /* #else #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.3 From 8762705ad4ac860bb78434409df463d02ac8f027 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jan 2012 16:59:01 -0800 Subject: rcu: Inform RCU of irq_exit() activity This is a port to TINY_RCU of Peter Zijlstra's commit #ec433f0c5 The rcu_read_unlock_special() function relies on in_irq() to exclude scheduler activity from interrupt level. This fails because exit_irq() can invoke the scheduler after clearing the preempt_count() bits that in_irq() uses to determine that it is at interrupt level. This situation can result in failures as follows: $task IRQ SoftIRQ rcu_read_lock() /* do stuff */ |= UNLOCK_BLOCKED rcu_read_unlock() --t->rcu_read_lock_nesting irq_enter(); /* do stuff, don't use RCU */ irq_exit(); sub_preempt_count(IRQ_EXIT_OFFSET); invoke_softirq() ttwu(); spin_lock_irq(&pi->lock) rcu_read_lock(); /* do stuff */ rcu_read_unlock(); rcu_read_unlock_special() rcu_report_exp_rnp() ttwu() spin_lock_irq(&pi->lock) /* deadlock */ rcu_read_unlock_special(t); This can be triggered 'easily' because invoke_softirq() immediately does a ttwu() of ksoftirqd/# instead of doing the in-place softirq stuff first, but even without that the above happens. Cure this by also excluding softirqs from the rcu_read_unlock_special() handler and ensuring the force_irqthreads ksoftirqd/# wakeup is done from full softirq context. It is also necessary to delay the ->rcu_read_lock_nesting decrement until after rcu_read_unlock_special(). This delay is handled by the commit "Protect __rcu_read_unlock() against scheduler-using irq handlers". Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutiny_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 95df60ebe363..387c2759e1b0 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -570,7 +570,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) rcu_preempt_cpu_qs(); /* Hardware IRQ handlers cannot block. */ - if (in_irq()) { + if (in_irq() || in_serving_softirq()) { local_irq_restore(flags); return; } -- cgit v1.2.3 From 1aa03f1188f7b0b85df2de602b33ee7b6fab8e00 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Jan 2012 17:25:17 -0800 Subject: rcu: Simplify unboosting checks This is a port of commit #82e78d80 from TREE_PREEMPT_RCU to TINY_PREEMPT_RCU. This commit uses the fact that current->rcu_boost_mutex is set any time that the RCU_READ_UNLOCK_BOOSTED flag is set in the current->rcu_read_unlock_special bitmask. This allows tests of the bit to be changed to tests of the pointer, which in turn allows the RCU_READ_UNLOCK_BOOSTED flag to be eliminated. Please note that the check of current->rcu_read_unlock_special need not change because any time that RCU_READ_UNLOCK_BOOSTED was set, so was RCU_READ_UNLOCK_BLOCKED. Therefore, __rcu_read_unlock() can continue testing current->rcu_read_unlock_special for non-zero, as before. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/sched.h | 3 +-- kernel/rcutiny_plugin.h | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d379a6bfd88..e692abaf915a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1864,8 +1864,7 @@ extern void task_clear_jobctl_pending(struct task_struct *task, #ifdef CONFIG_PREEMPT_RCU #define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_BOOSTED (1 << 1) /* boosted while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 2) /* RCU core needs CPU response. */ +#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ static inline void rcu_copy_process(struct task_struct *p) { diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 387c2759e1b0..22ecea0dfb62 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -318,7 +318,6 @@ static int rcu_boost(void) t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); t->rcu_boost_mutex = &mtx; - t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; raw_local_irq_restore(flags); rt_mutex_lock(&mtx); rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -550,6 +549,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) int empty_exp; unsigned long flags; struct list_head *np; +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rbmp = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ int special; /* @@ -615,10 +617,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) } #ifdef CONFIG_RCU_BOOST /* Unboost self if was boosted. */ - if (special & RCU_READ_UNLOCK_BOOSTED) { - t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; - rt_mutex_unlock(t->rcu_boost_mutex); + if (t->rcu_boost_mutex != NULL) { + rbmp = t->rcu_boost_mutex; t->rcu_boost_mutex = NULL; + rt_mutex_unlock(rbmp); } #endif /* #ifdef CONFIG_RCU_BOOST */ local_irq_restore(flags); -- cgit v1.2.3 From 30fbcc90b02187c55c57ff0ecf57cecbd487d694 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 11:01:14 -0800 Subject: rcu: Clean up straggling rcu_preempt_needs_cpu() name The recent updates to RCU_CPU_FAST_NO_HZ have an rcu_needs_cpu() that does more than just check for callbacks, so get the name for rcu_preempt_needs_cpu() consistent with that change, now calling it rcu_preempt_cpu_has_callbacks(). Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- kernel/rcutree.h | 2 +- kernel/rcutree_plugin.h | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 49bb363a0837..0569ba11e35a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1958,7 +1958,7 @@ static int rcu_cpu_has_callbacks(int cpu) /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || per_cpu(rcu_bh_data, cpu).nxtlist || - rcu_preempt_needs_cpu(cpu); + rcu_preempt_cpu_has_callbacks(cpu); } static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 05e03675439a..58c9fc3bc820 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -449,7 +449,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake); #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ static int rcu_preempt_pending(int cpu); -static int rcu_preempt_needs_cpu(int cpu); +static int rcu_preempt_cpu_has_callbacks(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_cleanup_dying_cpu(void); static void __init __rcu_init_preempt(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f8e6dcc91860..297d561c3e4b 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -887,9 +887,9 @@ static int rcu_preempt_pending(int cpu) } /* - * Does preemptible RCU need the CPU to stay out of dynticks mode? + * Does preemptible RCU have callbacks on this CPU? */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu) { return !!per_cpu(rcu_preempt_data, cpu).nxtlist; } @@ -1128,9 +1128,9 @@ static int rcu_preempt_pending(int cpu) } /* - * Because preemptible RCU does not exist, it never needs any CPU. + * Because preemptible RCU does not exist, it never has callbacks */ -static int rcu_preempt_needs_cpu(int cpu) +static int rcu_preempt_cpu_has_callbacks(int cpu) { return 0; } -- cgit v1.2.3 From c44e2cddacc2cf299186bad5697d738ea19668b7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 13:08:18 -0800 Subject: rcu: Check for idle-loop entry while in RCU read-side critical section The inner idle loop is an extended quiescent state for all flavors of RCU, but there have been recent bug involving use of RCU read-side primitives from within the idle loop. Therefore, this commit enlists lockdep-RCU to detect attempts to enter the inner idle loop while in an RCU read-side critical section, emitting a lockdep-RCU splat if so. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 0569ba11e35a..195bf1fb59ea 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -366,6 +366,17 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) atomic_inc(&rdtp->dynticks); smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + + /* + * The idle task is not permitted to enter the idle loop while + * in an RCU read-side critical section. + */ + rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /** -- cgit v1.2.3 From 27565d64a4e564e72c22d8c91a3cfcb9442383e8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Jan 2012 19:35:08 -0800 Subject: rcu: Remove #ifdef CONFIG_SMP from TREE_RCU Now that both TINY_RCU and TINY_PREEMPT_RCU have been in place for awhile, it is time to remove UP support from TREE_RCU, which is what this commit does. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 19 ------------------- kernel/rcutree_plugin.h | 12 ------------ 2 files changed, 31 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 195bf1fb59ea..61adb351d2c7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -301,8 +301,6 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) return &rsp->node[0]; } -#ifdef CONFIG_SMP - /* * If the specified CPU is offline, tell the caller that it is in * a quiescent state. Otherwise, whack it with a reschedule IPI. @@ -339,8 +337,6 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) return 0; } -#endif /* #ifdef CONFIG_SMP */ - /* * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle * @@ -606,8 +602,6 @@ int rcu_is_cpu_rrupt_from_idle(void) return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; } -#ifdef CONFIG_SMP - /* * Snapshot the specified CPU's dynticks counter so that we can later * credit them with an implicit quiescent state. Return 1 if this CPU @@ -651,8 +645,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return rcu_implicit_offline_qs(rdp); } -#endif /* #ifdef CONFIG_SMP */ - static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; @@ -1517,8 +1509,6 @@ void rcu_check_callbacks(int cpu, int user) trace_rcu_utilization("End scheduler-tick"); } -#ifdef CONFIG_SMP - /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. @@ -1641,15 +1631,6 @@ unlock_fqs_ret: trace_rcu_utilization("End fqs"); } -#else /* #ifdef CONFIG_SMP */ - -static void force_quiescent_state(struct rcu_state *rsp, int relaxed) -{ - set_need_resched(); -} - -#endif /* #else #ifdef CONFIG_SMP */ - /* * This does the RCU core processing work for the specified rcu_state * and rcu_data structures. This may be called only from the CPU to diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 297d561c3e4b..0e74e1c6333f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1858,16 +1858,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#ifndef CONFIG_SMP - -void synchronize_sched_expedited(void) -{ - cond_resched(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - -#else /* #ifndef CONFIG_SMP */ - static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); @@ -1982,8 +1972,6 @@ void synchronize_sched_expedited(void) } EXPORT_SYMBOL_GPL(synchronize_sched_expedited); -#endif /* #else #ifndef CONFIG_SMP */ - #if !defined(CONFIG_RCU_FAST_NO_HZ) /* -- cgit v1.2.3 From 13cfcca0e4e2d4cee1d0183c049eb34e54ac976e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Jan 2012 15:32:18 -0800 Subject: rcu: Set RCU CPU stall times via sysfs The default CONFIG_RCU_CPU_STALL_TIMEOUT value of 60 seconds has served Linux users well for production use for quite some time. However, for debugging, there will be more than three minutes between subsequent stall-warning messages. This can be an annoyingly long wait if you are trying to work out where the offending infinite loop is hiding. Therefore, this commit provides a rcu_cpu_stall_timeout sysfs parameter that may be adjusted at boot time and at runtime to speed up debugging. Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 31 ++++++++++++++++++++++++++----- kernel/rcutree.h | 6 ------ 2 files changed, 26 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 61adb351d2c7..cfdab9898e33 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -208,8 +208,11 @@ module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); -int rcu_cpu_stall_suppress __read_mostly; +int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ +int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; + module_param(rcu_cpu_stall_suppress, int, 0644); +module_param(rcu_cpu_stall_timeout, int, 0644); static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -645,10 +648,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return rcu_implicit_offline_qs(rdp); } +static int jiffies_till_stall_check(void) +{ + int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); + + /* + * Limit check must be consistent with the Kconfig limits + * for CONFIG_RCU_CPU_STALL_TIMEOUT. + */ + if (till_stall_check < 3) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; + till_stall_check = 3; + } else if (till_stall_check > 300) { + ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; + till_stall_check = 300; + } + return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; +} + static void record_gp_stall_check_time(struct rcu_state *rsp) { rsp->gp_start = jiffies; - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; + rsp->jiffies_stall = jiffies + jiffies_till_stall_check(); } static void print_other_cpu_stall(struct rcu_state *rsp) @@ -667,7 +688,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; /* * Now rat on any tasks that got kicked up to the root rcu_node @@ -726,8 +747,8 @@ static void print_cpu_stall(struct rcu_state *rsp) raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) - rsp->jiffies_stall = - jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + rsp->jiffies_stall = jiffies + + 3 * jiffies_till_stall_check() + 3; raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 58c9fc3bc820..0328a537846a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -314,12 +314,6 @@ struct rcu_data { #else #define RCU_STALL_DELAY_DELTA 0 #endif - -#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ - RCU_STALL_DELAY_DELTA) - /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) - /* for rsp->jiffies_stall */ #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ /* to take at least one */ /* scheduling clock irq */ -- cgit v1.2.3 From a858af2875fb291d0f4b0a4419fefbf03c2379c0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Jan 2012 13:29:10 -0800 Subject: rcu: Print scheduling-clock information on RCU CPU stall-warning messages There have been situations where RCU CPU stall warnings were caused by issues in scheduling-clock timer initialization. To make it easier to track these down, this commit causes the RCU CPU stall-warning messages to print out the number of scheduling-clock interrupts taken in the current grace period for each stalled CPU. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 33 +++++++---- kernel/rcutree.h | 11 ++++ kernel/rcutree_plugin.h | 150 +++++++++++++++++++++++++++++++++++++++++++++++- lib/Kconfig.debug | 14 +++++ 4 files changed, 194 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index cfdab9898e33..dccd2f78db4e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -689,12 +689,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) return; } rsp->jiffies_stall = jiffies + 3 * jiffies_till_stall_check() + 3; - - /* - * Now rat on any tasks that got kicked up to the root rcu_node - * due to CPU offlining. - */ - ndetected = rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -702,8 +696,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks:", rsp->name); + print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); ndetected += rcu_print_task_stall(rnp); @@ -712,11 +707,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) { - printk(" %d", rnp->grplo + cpu); + print_cpu_stall_info(rsp, rnp->grplo + cpu); ndetected++; } } - printk("} (detected by %d, t=%ld jiffies)\n", + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rnp = rcu_get_root(rsp); + raw_spin_lock_irqsave(&rnp->lock, flags); + ndetected = rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + print_cpu_stall_info_end(); + printk(KERN_CONT "(detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); if (ndetected == 0) printk(KERN_ERR "INFO: Stall ended before state dump start\n"); @@ -740,8 +746,11 @@ static void print_cpu_stall(struct rcu_state *rsp) * See Documentation/RCU/stallwarn.txt for info on how to debug * RCU CPU stall warnings. */ - printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", - rsp->name, smp_processor_id(), jiffies - rsp->gp_start); + printk(KERN_ERR "INFO: %s self-detected stall on CPU", rsp->name); + print_cpu_stall_info_begin(); + print_cpu_stall_info(rsp, smp_processor_id()); + print_cpu_stall_info_end(); + printk(KERN_CONT " (t=%lu jiffies)\n", jiffies - rsp->gp_start); if (!trigger_all_cpu_backtrace()) dump_stack(); @@ -831,6 +840,7 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rdp->passed_quiesce = 0; } else rdp->qs_pending = 0; + zero_cpu_stall_ticks(rdp); } } @@ -1496,6 +1506,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) void rcu_check_callbacks(int cpu, int user) { trace_rcu_utilization("Start scheduler-tick"); + increment_cpu_stall_ticks(); if (user || rcu_is_cpu_rrupt_from_idle()) { /* diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 0328a537846a..e2ac8ee415bb 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -239,6 +239,12 @@ struct rcu_data { bool preemptible; /* Preemptible RCU? */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + unsigned long ticks_this_gp; /* The number of scheduling-clock */ + /* ticks this CPU has handled */ + /* during and after the last grace */ + /* period it is aware of. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ /* 2) batch handling */ /* @@ -466,5 +472,10 @@ static void __cpuinit rcu_prepare_kthreads(int cpu); static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); +static void print_cpu_stall_info_begin(void); +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); +static void print_cpu_stall_info_end(void); +static void zero_cpu_stall_ticks(struct rcu_data *rdp); +static void increment_cpu_stall_ticks(void); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 0e74e1c6333f..aa93b074bb2f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -63,7 +63,10 @@ static void __init rcu_bootup_announce_oddness(void) printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); #endif #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) - printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); + printk(KERN_INFO "\tDump stacks of tasks blocking RCU-preempt GP.\n"); +#endif +#if defined(CONFIG_RCU_CPU_STALL_INFO) + printk(KERN_INFO "\tAdditional per-CPU info printed with stalls.\n"); #endif #if NUM_RCU_LVL_4 != 0 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); @@ -490,6 +493,31 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ +#ifdef CONFIG_RCU_CPU_STALL_INFO + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ + printk(KERN_ERR "\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", + rnp->level, rnp->grplo, rnp->grphi); +} + +static void rcu_print_task_stall_end(void) +{ + printk(KERN_CONT "\n"); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void rcu_print_task_stall_begin(struct rcu_node *rnp) +{ +} + +static void rcu_print_task_stall_end(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -501,12 +529,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; + rcu_print_task_stall_begin(rnp); t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { - printk(" P%d", t->pid); + printk(KERN_CONT " P%d", t->pid); ndetected++; } + rcu_print_task_stall_end(); return ndetected; } @@ -2004,7 +2034,7 @@ static void rcu_cleanup_after_idle(int cpu) } /* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y, + * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, * is nothing. */ static void rcu_prepare_for_idle(int cpu) @@ -2273,3 +2303,117 @@ static void rcu_prepare_for_idle(int cpu) } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#ifdef CONFIG_RCU_CPU_STALL_INFO + +#ifdef CONFIG_RCU_FAST_NO_HZ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ + struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu); + + sprintf(cp, "drain=%d %c timer=%lld", + per_cpu(rcu_dyntick_drain, cpu), + per_cpu(rcu_dyntick_holdoff, cpu) == jiffies ? 'H' : '.', + hrtimer_active(hrtp) + ? ktime_to_us(hrtimer_get_remaining(hrtp)) + : -1); +} + +#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + +static void print_cpu_stall_fast_no_hz(char *cp, int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ + +/* Initiate the stall-info list. */ +static void print_cpu_stall_info_begin(void) +{ + printk(KERN_CONT "\n"); +} + +/* + * Print out diagnostic information for the specified stalled CPU. + * + * If the specified CPU is aware of the current RCU grace period + * (flavor specified by rsp), then print the number of scheduling + * clock interrupts the CPU has taken during the time that it has + * been aware. Otherwise, print the number of RCU grace periods + * that this CPU is ignorant of, for example, "1" if the CPU was + * aware of the previous grace period. + * + * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + */ +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + char fast_no_hz[72]; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = rdp->dynticks; + char *ticks_title; + unsigned long ticks_value; + + if (rsp->gpnum == rdp->gpnum) { + ticks_title = "ticks this GP"; + ticks_value = rdp->ticks_this_gp; + } else { + ticks_title = "GPs behind"; + ticks_value = rsp->gpnum - rdp->gpnum; + } + print_cpu_stall_fast_no_hz(fast_no_hz, cpu); + printk(KERN_ERR "\t%d: (%lu %s) idle=%03x/%llx/%d %s\n", + cpu, ticks_value, ticks_title, + atomic_read(&rdtp->dynticks) & 0xfff, + rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, + fast_no_hz); +} + +/* Terminate the stall-info list. */ +static void print_cpu_stall_info_end(void) +{ + printk(KERN_ERR "\t"); +} + +/* Zero ->ticks_this_gp for all flavors of RCU. */ +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ + rdp->ticks_this_gp = 0; +} + +/* Increment ->ticks_this_gp for all flavors of RCU. */ +static void increment_cpu_stall_ticks(void) +{ + __get_cpu_var(rcu_sched_data).ticks_this_gp++; + __get_cpu_var(rcu_bh_data).ticks_this_gp++; +#ifdef CONFIG_TREE_PREEMPT_RCU + __get_cpu_var(rcu_preempt_data).ticks_this_gp++; +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + +static void print_cpu_stall_info_begin(void) +{ + printk(KERN_CONT " {"); +} + +static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) +{ + printk(KERN_CONT " %d", cpu); +} + +static void print_cpu_stall_info_end(void) +{ + printk(KERN_CONT "} "); +} + +static void zero_cpu_stall_ticks(struct rcu_data *rdp) +{ +} + +static void increment_cpu_stall_ticks(void) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3cc419d05f2f..d27a2aa3e815 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -927,6 +927,20 @@ config RCU_CPU_STALL_VERBOSE Say Y if you want to enable such checks. +config RCU_CPU_STALL_INFO + bool "Print additional diagnostics on RCU CPU stall" + depends on (TREE_RCU || TREE_PREEMPT_RCU) && DEBUG_KERNEL + default n + help + For each stalled CPU that is aware of the current RCU grace + period, print out additional per-CPU diagnostic information + regarding scheduling-clock ticks, idle state, and, + for RCU_FAST_NO_HZ kernels, idle-entry state. + + Say N if you are unsure. + + Say Y if you want to enable such diagnostics. + config RCU_TRACE bool "Enable tracing for RCU" depends on DEBUG_KERNEL -- cgit v1.2.3 From 9b9ec9b90ef481e182927989963274dc2697f9a6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Jan 2012 14:36:51 -0800 Subject: rcutorture: Permit holding off CPU-hotplug operations during boot When rcutorture is started automatically at boot time, it might well also start CPU-hotplug operations at that time, which might not be desirable. This commit therefore adds an rcutorture parameter that allows CPU-hotplug operations to be held off for the specified number of seconds after the start of boot. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/torture.txt | 13 +++++++++++-- kernel/rcutorture.c | 12 ++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index d67068d0d2b9..01a809b1dbc6 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -69,6 +69,13 @@ onoff_interval CPU-hotplug operations regardless of what value is specified for onoff_interval. +onoff_holdoff The number of seconds to wait until starting CPU-hotplug + operations. This would normally only be used when + rcutorture was built into the kernel and started + automatically at boot time, in which case it is useful + in order to avoid confusing boot-time code with CPUs + coming and going. + shuffle_interval The number of seconds to keep the test threads affinitied to a particular subset of the CPUs, defaults to 3 seconds. @@ -277,5 +284,7 @@ The following script may be used to torture RCU: The output can be manually inspected for the error flag of "!!!". One could of course create a more elaborate script that automatically -checked for such errors. The "rmmod" command forces a "SUCCESS" or -"FAILURE" indication to be printk()ed. +checked for such errors. The "rmmod" command forces a "SUCCESS", +"FAILURE", or "RCU_HOTPLUG" indication to be printk()ed. The first +two are self-explanatory, while the last indicates that while there +were no RCU failures, CPU-hotplug problems were detected. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index a52b3217c160..2914cafe171b 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -65,6 +65,7 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ +static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ @@ -95,6 +96,8 @@ module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); module_param(onoff_interval, int, 0444); MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +module_param(onoff_holdoff, int, 0444); +MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); module_param(shutdown_secs, int, 0444); MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); module_param(test_boost, int, 0444); @@ -1300,13 +1303,13 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " "test_boost=%d/%d test_boost_interval=%d " "test_boost_duration=%d shutdown_secs=%d " - "onoff_interval=%d\n", + "onoff_interval=%d onoff_holdoff=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, test_boost, cur_ops->can_boost, test_boost_interval, test_boost_duration, shutdown_secs, - onoff_interval); + onoff_interval, onoff_holdoff); } static struct notifier_block rcutorture_shutdown_nb = { @@ -1410,6 +1413,11 @@ rcu_torture_onoff(void *arg) for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + if (onoff_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); + schedule_timeout_interruptible(onoff_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); + } while (!kthread_should_stop()) { cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { -- cgit v1.2.3 From c13f3757d0fcdcc2b7fc5d5e38da76b8913e6648 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 20 Jan 2012 15:36:33 -0800 Subject: rcu: Add CPU-stall capability to rcutorture Add module parameters to rcutorture that induce a CPU stall. The stall_cpu parameter specifies how long to stall in seconds, defaulting to zero, which indicates no stalling is to be undertaken. The stall_cpu_holdoff parameter specifies how many seconds after insmod (or boot, if rcutorture is built into the kernel) that this stall is to start. The default value for stall_cpu_holdoff is ten seconds. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/torture.txt | 18 ++++++++++++ kernel/rcutorture.c | 66 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) (limited to 'kernel') diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index d25be872ae33..375d3fb71437 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -86,6 +86,24 @@ shutdown_secs The number of seconds to run the test before terminating zero, which disables test termination and system shutdown. This capability is useful for automated testing. +stall_cpu The number of seconds that a CPU should be stalled while + within both an rcu_read_lock() and a preempt_disable(). + This stall happens only once per rcutorture run. + If you need multiple stalls, use modprobe and rmmod to + repeatedly run rcutorture. The default for stall_cpu + is zero, which prevents rcutorture from stalling a CPU. + + Note that attempts to rmmod rcutorture while the stall + is ongoing will hang, so be careful what value you + choose for this module parameter! In addition, too-large + values for stall_cpu might well induce failures and + warnings in other parts of the kernel. You have been + warned! + +stall_cpu_holdoff + The number of seconds to wait after rcutorture starts + before stalling a CPU. Defaults to 10 seconds. + stat_interval The number of seconds between output of torture statistics (via printk()). Regardless of the interval, statistics are printed when the module is unloaded. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 2914cafe171b..5cfa23be43bd 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -67,6 +67,8 @@ static int fqs_stutter = 3; /* Wait time between bursts (s). */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ +static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ +static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ @@ -100,6 +102,10 @@ module_param(onoff_holdoff, int, 0444); MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); module_param(shutdown_secs, int, 0444); MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); +module_param(stall_cpu, int, 0444); +MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +module_param(stall_cpu_holdoff, int, 0444); +MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); module_param(test_boost_interval, int, 0444); @@ -132,6 +138,7 @@ static struct task_struct *shutdown_task; #ifdef CONFIG_HOTPLUG_CPU static struct task_struct *onoff_task; #endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static struct task_struct *stall_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -1489,6 +1496,63 @@ static void rcu_torture_onoff_cleanup(void) #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ +/* + * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then + * induces a CPU stall for the time specified by stall_cpu. + */ +static int __cpuinit rcu_torture_stall(void *args) +{ + unsigned long stop_at; + + VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); + if (stall_cpu_holdoff > 0) { + VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); + schedule_timeout_interruptible(stall_cpu_holdoff * HZ); + VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); + } + if (!kthread_should_stop()) { + stop_at = get_seconds() + stall_cpu; + /* RCU CPU stall is expected behavior in following code. */ + printk(KERN_ALERT "rcu_torture_stall start.\n"); + rcu_read_lock(); + preempt_disable(); + while (ULONG_CMP_LT(get_seconds(), stop_at)) + continue; /* Induce RCU CPU stall warning. */ + preempt_enable(); + rcu_read_unlock(); + printk(KERN_ALERT "rcu_torture_stall end.\n"); + } + rcutorture_shutdown_absorb("rcu_torture_stall"); + while (!kthread_should_stop()) + schedule_timeout_interruptible(10 * HZ); + return 0; +} + +/* Spawn CPU-stall kthread, if stall_cpu specified. */ +static int __init rcu_torture_stall_init(void) +{ + int ret; + + if (stall_cpu <= 0) + return 0; + stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); + if (IS_ERR(stall_task)) { + ret = PTR_ERR(stall_task); + stall_task = NULL; + return ret; + } + return 0; +} + +/* Clean up after the CPU-stall kthread, if one was spawned. */ +static void rcu_torture_stall_cleanup(void) +{ + if (stall_task == NULL) + return; + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); + kthread_stop(stall_task); +} + static int rcutorture_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1531,6 +1595,7 @@ rcu_torture_cleanup(void) fullstop = FULLSTOP_RMMOD; mutex_unlock(&fullstop_mutex); unregister_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_stall_cleanup(); if (stutter_task) { VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); kthread_stop(stutter_task); @@ -1831,6 +1896,7 @@ rcu_torture_init(void) } rcu_torture_onoff_init(); register_reboot_notifier(&rcutorture_shutdown_nb); + rcu_torture_stall_init(); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; -- cgit v1.2.3 From c0d6d01bffdce19fa19baad6cb8cc3eed7bfd6f5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jan 2012 12:41:26 -0800 Subject: rcu: Check for illegal use of RCU from offlined CPUs Although it is legal to use RCU during early boot, it is anything but legal to use RCU at runtime from an offlined CPU. After all, RCU explicitly ignores offlined CPUs. This commit therefore adds checks for runtime use of RCU from offlined CPUs. These checks are not perfect, in particular, they can be subverted through use of things like rcu_dereference_raw(). Note that it is not possible to put checks in rcu_read_lock() and friends due to the fact that these primitives are used in code that might be used under either RCU or lock-based protection, which means that checking rcu_read_lock() gets you fat piles of false positives. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 19 +++++++++++++++++++ include/linux/srcu.h | 11 +++++++---- kernel/rcupdate.c | 5 +++++ kernel/rcutree.c | 29 +++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 1 + 5 files changed, 61 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index f409529ff35a..146d37d31778 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -226,6 +226,15 @@ static inline void destroy_rcu_head_on_stack(struct rcu_head *head) } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) +bool rcu_lockdep_current_cpu_online(void); +#else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ +static inline bool rcu_lockdep_current_cpu_online(void) +{ + return 1; +} +#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #ifdef CONFIG_PROVE_RCU @@ -270,6 +279,9 @@ extern int debug_lockdep_rcu_enabled(void); * occur in the same context, for example, it is illegal to invoke * rcu_read_unlock() in process context if the matching rcu_read_lock() * was invoked from within an irq handler. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. */ static inline int rcu_read_lock_held(void) { @@ -277,6 +289,8 @@ static inline int rcu_read_lock_held(void) return 1; if (rcu_is_cpu_idle()) return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; return lock_is_held(&rcu_lock_map); } @@ -313,6 +327,9 @@ extern int rcu_read_lock_bh_held(void); * notice an extended quiescent state to other CPUs that started a grace * period. Otherwise we would delay any grace period as long as we run in * the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. */ #ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) @@ -323,6 +340,8 @@ static inline int rcu_read_lock_sched_held(void) return 1; if (rcu_is_cpu_idle()) return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; if (debug_locks) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e1b005918bbb..9a323728e60c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -99,15 +99,18 @@ long srcu_batches_completed(struct srcu_struct *sp); * power mode. This way we can notice an extended quiescent state to * other CPUs that started a grace period. Otherwise we would delay any * grace period as long as we run in the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. */ static inline int srcu_read_lock_held(struct srcu_struct *sp) { - if (rcu_is_cpu_idle()) - return 0; - if (!debug_lockdep_rcu_enabled()) return 1; - + if (rcu_is_cpu_idle()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; return lock_is_held(&sp->dep_map); } diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2bc4e135ff23..a86f1741cc27 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -88,6 +88,9 @@ EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); * section. * * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + * + * Note that rcu_read_lock() is disallowed if the CPU is either idle or + * offline from an RCU perspective, so check for those as well. */ int rcu_read_lock_bh_held(void) { @@ -95,6 +98,8 @@ int rcu_read_lock_bh_held(void) return 1; if (rcu_is_cpu_idle()) return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; return in_softirq() || irqs_disabled(); } EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index dccd2f78db4e..bcf7db2f2fd2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -591,6 +591,35 @@ int rcu_is_cpu_idle(void) } EXPORT_SYMBOL(rcu_is_cpu_idle); +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Is the current CPU online? Disable preemption to avoid false positives + * that could otherwise happen due to the current CPU number being sampled, + * this task being preempted, its old CPU being taken offline, resuming + * on some other CPU, then determining that its old CPU is now offline. + * It is OK to use RCU on an offline processor during initial boot, hence + * the check for rcu_scheduler_fully_active. + * + * Disable checking if in an NMI handler because we cannot safely report + * errors from NMI handlers anyway. + */ +bool rcu_lockdep_current_cpu_online(void) +{ + bool ret; + + if (in_nmi()) + return 1; + preempt_disable(); + ret = cpu_online(smp_processor_id()) || + !rcu_scheduler_fully_active; + preempt_enable(); + return ret; +} +EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #endif /* #ifdef CONFIG_PROVE_RCU */ /** diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index aa93b074bb2f..cecea84f4f3f 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1946,6 +1946,7 @@ void synchronize_sched_expedited(void) /* Note that atomic_inc_return() implies full memory barrier. */ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); get_online_cpus(); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); /* * Each pass through the following loop attempts to force a -- cgit v1.2.3 From 3d3b7db0a22085cfc05c3318b9874f7fb8266d18 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jan 2012 17:05:46 -0800 Subject: rcu: Move synchronize_sched_expedited() to rcutree.c Now that TREE_RCU and TREE_PREEMPT_RCU no longer do anything different for the single-CPU case, there is no need for multiple definitions of synchronize_sched_expedited(). It is no longer in any sense a plug-in, so move it from kernel/rcutree_plugin.h to kernel/rcutree.c. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/rcutree_plugin.h | 116 ----------------------------------------------- 2 files changed, 117 insertions(+), 116 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index bcf7db2f2fd2..05470d4caba3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -50,6 +50,8 @@ #include #include #include +#include +#include #include "rcutree.h" #include @@ -1918,6 +1920,121 @@ void synchronize_rcu_bh(void) } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + int firstsnap, s, snap, trycount = 0; + + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + get_online_cpus(); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). */ + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. + */ + get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started); + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. + */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index cecea84f4f3f..98ce17cf1fb5 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -25,7 +25,6 @@ */ #include -#include #define RCU_KTHREAD_PRIO 1 @@ -1888,121 +1887,6 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0); -static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); - -static int synchronize_sched_expedited_cpu_stop(void *data) -{ - /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. - */ - smp_mb(); /* See above comment block. */ - return 0; -} - -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. - * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. - * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). - */ -void synchronize_sched_expedited(void) -{ - int firstsnap, s, snap, trycount = 0; - - /* Note that atomic_inc_return() implies full memory barrier. */ - firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); - get_online_cpus(); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); - - /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. - */ - while (try_stop_cpus(cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) - udelay(trycount * num_online_cpus()); - else { - synchronize_sched(); - return; - } - - /* Check to see if someone else did our work for us. */ - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { - smp_mb(); /* ensure test happens before caller kfree */ - return; - } - - /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We subtract - * 1 to get the same token that the last incrementer got. - * We retry after they started, so our grace period works - * for them, and they started after our first try, so their - * grace period works for us. - */ - get_online_cpus(); - snap = atomic_read(&sync_sched_expedited_started); - smp_mb(); /* ensure read is before try_stop_cpus(). */ - } - - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did beat us to the punch. - */ - do { - s = atomic_read(&sync_sched_expedited_done); - if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { - smp_mb(); /* ensure test happens before caller kfree */ - break; - } - } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); - - put_online_cpus(); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - #if !defined(CONFIG_RCU_FAST_NO_HZ) /* -- cgit v1.2.3 From c0cfbbb0d4fca14b828a7635a59784adfba8989d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Jan 2012 17:23:35 -0800 Subject: rcu: No interrupt disabling for rcu_prepare_for_idle() The rcu_prepare_for_idle() function is always called with interrupts disabled, so there is no reason to disable interrupts again within rcu_prepare_for_idle(). Therefore, this commit removes all of the interrupt disabling, also removing a latent disabling-unbalance bug. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 98ce17cf1fb5..5e25ee327ccb 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2096,10 +2096,6 @@ static void rcu_cleanup_after_idle(int cpu) */ static void rcu_prepare_for_idle(int cpu) { - unsigned long flags; - - local_irq_save(flags); - /* * If there are no callbacks on this CPU, enter dyntick-idle mode. * Also reset state to avoid prejudicing later attempts. @@ -2107,7 +2103,6 @@ static void rcu_prepare_for_idle(int cpu) if (!rcu_cpu_has_callbacks(cpu)) { per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; per_cpu(rcu_dyntick_drain, cpu) = 0; - local_irq_restore(flags); trace_rcu_prep_idle("No callbacks"); return; } @@ -2117,7 +2112,6 @@ static void rcu_prepare_for_idle(int cpu) * refrained from disabling the scheduling-clock tick. */ if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) { - local_irq_restore(flags); trace_rcu_prep_idle("In holdoff"); return; } @@ -2142,7 +2136,6 @@ static void rcu_prepare_for_idle(int cpu) } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { /* We have hit the limit, so time to give up. */ per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - local_irq_restore(flags); trace_rcu_prep_idle("Begin holdoff"); invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ return; @@ -2154,23 +2147,17 @@ static void rcu_prepare_for_idle(int cpu) */ #ifdef CONFIG_TREE_PREEMPT_RCU if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_preempt_qs(cpu); force_quiescent_state(&rcu_preempt_state, 0); - local_irq_save(flags); } #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ if (per_cpu(rcu_sched_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_sched_qs(cpu); force_quiescent_state(&rcu_sched_state, 0); - local_irq_save(flags); } if (per_cpu(rcu_bh_data, cpu).nxtlist) { - local_irq_restore(flags); rcu_bh_qs(cpu); force_quiescent_state(&rcu_bh_state, 0); - local_irq_save(flags); } /* @@ -2178,13 +2165,10 @@ static void rcu_prepare_for_idle(int cpu) * So try forcing the callbacks through the grace period. */ if (rcu_cpu_has_callbacks(cpu)) { - local_irq_restore(flags); trace_rcu_prep_idle("More callbacks"); invoke_rcu_core(); - } else { - local_irq_restore(flags); + } else trace_rcu_prep_idle("Callbacks drained"); - } } #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -- cgit v1.2.3 From c5fdcec927ee31fc96e92339c3a83ac6e0725289 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Jan 2012 08:46:32 -0800 Subject: lockdep: Add CPU-idle/offline warning to lockdep-RCU splat It is illegal to use RCU from a CPU that has reported idleness or offlinedness to RCU. However, it can be quite difficult to determine from a stack trace whether or not a given CPU is idle or offline. Therefore, this commit adds idle/offline diagnostics to the lockdep-RCU error message. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/lockdep.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 8889f7dd7c46..ea9ee4518c35 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -4176,7 +4176,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) printk("-------------------------------\n"); printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); - printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); + printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + !rcu_lockdep_current_cpu_online() + ? "RCU used illegally from offline CPU!\n" + : rcu_is_cpu_idle() + ? "RCU used illegally from idle CPU!\n" + : "", + rcu_scheduler_active, debug_locks); /* * If a CPU is in the RCU-free window in idle (ie: in the section -- cgit v1.2.3 From 2036d94a7b61ca5032ce90f2bda06afec0fe713e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 30 Jan 2012 17:02:47 -0800 Subject: rcu: Rework detection of use of RCU by offline CPUs Because newly offlined CPUs continue executing after completing the CPU_DYING notifiers, they legitimately enter the scheduler and use RCU while appearing to be offline. This calls for a more sophisticated approach as follows: 1. RCU marks the CPU online during the CPU_UP_PREPARE phase. 2. RCU marks the CPU offline during the CPU_DEAD phase. 3. Diagnostics regarding use of read-side RCU by offline CPUs use RCU's accounting rather than the cpu_online_map. (Note that __call_rcu() still uses cpu_online_map to detect illegal invocations within CPU_DYING notifiers.) 4. Offline CPUs are prevented from hanging the system by force_quiescent_state(), which pays attention to cpu_online_map. Some additional work (in a later commit) will be needed to guarantee that force_quiescent_state() waits a full jiffy before assuming that a CPU is offline, for example, when called from idle entry. (This commit also makes the one-jiffy wait explicit, since the old-style implicit wait can now be defeated by RCU_FAST_NO_HZ and by rcutorture.) This approach avoids the false positives encountered when attempting to use more exact classification of CPU online/offline state. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/RCU/trace.txt | 36 +++++++------- kernel/rcutree.c | 113 ++++++++++++++++++++++++++------------------ kernel/rcutree.h | 1 - kernel/rcutree_plugin.h | 2 +- kernel/rcutree_trace.c | 6 +-- 5 files changed, 87 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 49587abfc2f7..f6f15ce39903 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -33,23 +33,23 @@ rcu/rcuboost: The output of "cat rcu/rcudata" looks as follows: rcu_sched: - 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ri=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 - 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ri=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 - 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ri=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 - 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ri=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 - 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ri=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 - 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ri=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 - 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ri=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 - 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ri=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 + 0 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=545/1/0 df=50 of=0 ql=163 qs=NRW. kt=0/W/0 ktl=ebc3 b=10 ci=153737 co=0 ca=0 + 1 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=967/1/0 df=58 of=0 ql=634 qs=NRW. kt=0/W/1 ktl=58c b=10 ci=191037 co=0 ca=0 + 2 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1081/1/0 df=175 of=0 ql=74 qs=N.W. kt=0/W/2 ktl=da94 b=10 ci=75991 co=0 ca=0 + 3 c=20942 g=20943 pq=1 pgp=20942 qp=1 dt=1846/0/0 df=404 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=72261 co=0 ca=0 + 4 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=369/1/0 df=83 of=0 ql=48 qs=N.W. kt=0/W/4 ktl=e0e7 b=10 ci=128365 co=0 ca=0 + 5 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=381/1/0 df=64 of=0 ql=169 qs=NRW. kt=0/W/5 ktl=fb2f b=10 ci=164360 co=0 ca=0 + 6 c=20972 g=20973 pq=1 pgp=20973 qp=0 dt=1037/1/0 df=183 of=0 ql=62 qs=N.W. kt=0/W/6 ktl=d2ad b=10 ci=65663 co=0 ca=0 + 7 c=20897 g=20897 pq=1 pgp=20896 qp=0 dt=1572/0/0 df=382 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=75006 co=0 ca=0 rcu_bh: - 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 - 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ri=1 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0 - 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0 - 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0 - 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0 - 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ri=1 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0 - 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ri=1 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0 - 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ri=1 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0 + 0 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=545/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/0 ktl=ebc3 b=10 ci=0 co=0 ca=0 + 1 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=967/1/0 df=3 of=0 ql=0 qs=.... kt=0/W/1 ktl=58c b=10 ci=151 co=0 ca=0 + 2 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1081/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/2 ktl=da94 b=10 ci=0 co=0 ca=0 + 3 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1846/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/3 ktl=d1cd b=10 ci=0 co=0 ca=0 + 4 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=369/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/4 ktl=e0e7 b=10 ci=0 co=0 ca=0 + 5 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=381/1/0 df=4 of=0 ql=0 qs=.... kt=0/W/5 ktl=fb2f b=10 ci=0 co=0 ca=0 + 6 c=1480 g=1480 pq=1 pgp=1480 qp=0 dt=1037/1/0 df=6 of=0 ql=0 qs=.... kt=0/W/6 ktl=d2ad b=10 ci=0 co=0 ca=0 + 7 c=1474 g=1474 pq=1 pgp=1473 qp=0 dt=1572/0/0 df=8 of=0 ql=0 qs=.... kt=0/W/7 ktl=cf15 b=10 ci=0 co=0 ca=0 The first section lists the rcu_data structures for rcu_sched, the second for rcu_bh. Note that CONFIG_TREE_PREEMPT_RCU kernels will have an @@ -119,10 +119,6 @@ o "of" is the number of times that some other CPU has forced a CPU is offline when it is really alive and kicking) is a fatal error, so it makes sense to err conservatively. -o "ri" is the number of times that RCU has seen fit to send a - reschedule IPI to this CPU in order to get it to report a - quiescent state. - o "ql" is the number of RCU callbacks currently residing on this CPU. This is the total number of callbacks, regardless of what state they are in (new, waiting for grace period to diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 05470d4caba3..708469a06860 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -320,25 +320,18 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static int rcu_implicit_offline_qs(struct rcu_data *rdp) { /* - * If the CPU is offline, it is in a quiescent state. We can - * trust its state not to change because interrupts are disabled. + * If the CPU is offline for more than a jiffy, it is in a quiescent + * state. We can trust its state not to change because interrupts + * are disabled. The reason for the jiffy's worth of slack is to + * handle CPUs initializing on the way up and finding their way + * to the idle loop on the way down. */ - if (cpu_is_offline(rdp->cpu)) { + if (cpu_is_offline(rdp->cpu) && + ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); rdp->offline_fqs++; return 1; } - - /* - * The CPU is online, so send it a reschedule IPI. This forces - * it through the scheduler, and (inefficiently) also handles cases - * where idle loops fail to inform RCU about the CPU being idle. - */ - if (rdp->cpu != smp_processor_id()) - smp_send_reschedule(rdp->cpu); - else - set_need_resched(); - rdp->resched_ipi++; return 0; } @@ -601,19 +594,33 @@ EXPORT_SYMBOL(rcu_is_cpu_idle); * this task being preempted, its old CPU being taken offline, resuming * on some other CPU, then determining that its old CPU is now offline. * It is OK to use RCU on an offline processor during initial boot, hence - * the check for rcu_scheduler_fully_active. + * the check for rcu_scheduler_fully_active. Note also that it is OK + * for a CPU coming online to use RCU for one jiffy prior to marking itself + * online in the cpu_online_mask. Similarly, it is OK for a CPU going + * offline to continue to use RCU for one jiffy after marking itself + * offline in the cpu_online_mask. This leniency is necessary given the + * non-atomic nature of the online and offline processing, for example, + * the fact that a CPU enters the scheduler after completing the CPU_DYING + * notifiers. + * + * This is also why RCU internally marks CPUs online during the + * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. * * Disable checking if in an NMI handler because we cannot safely report * errors from NMI handlers anyway. */ bool rcu_lockdep_current_cpu_online(void) { + struct rcu_data *rdp; + struct rcu_node *rnp; bool ret; if (in_nmi()) return 1; preempt_disable(); - ret = cpu_online(smp_processor_id()) || + rdp = &__get_cpu_var(rcu_sched_data); + rnp = rdp->mynode; + ret = (rdp->grpmask & rnp->qsmaskinit) || !rcu_scheduler_fully_active; preempt_enable(); return ret; @@ -1308,14 +1315,12 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - unsigned long flags; int i; unsigned long mask; - int need_report; int receive_cpu = cpumask_any(cpu_online_mask); struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); - struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */ + RCU_TRACE(struct rcu_node *rnp = rdp->mynode); /* For dying CPU. */ /* First, adjust the counts. */ if (rdp->nxtlist != NULL) { @@ -1381,32 +1386,6 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) "cpuofl"); rcu_report_qs_rdp(smp_processor_id(), rsp, rdp, rsp->gpnum); /* Note that rcu_report_qs_rdp() might call trace_rcu_grace_period(). */ - - /* - * Remove the dying CPU from the bitmasks in the rcu_node - * hierarchy. Because we are in stop_machine() context, we - * automatically exclude ->onofflock critical sections. - */ - do { - raw_spin_lock_irqsave(&rnp->lock, flags); - rnp->qsmaskinit &= ~mask; - if (rnp->qsmaskinit != 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - break; - } - if (rnp == rdp->mynode) { - need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - if (need_report & RCU_OFL_TASKS_NORM_GP) - rcu_report_unblock_qs_rnp(rnp, flags); - else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - if (need_report & RCU_OFL_TASKS_EXP_GP) - rcu_report_exp_rnp(rsp, rnp, true); - } else - raw_spin_unlock_irqrestore(&rnp->lock, flags); - mask = rnp->grpmask; - rnp = rnp->parent; - } while (rnp != NULL); } /* @@ -1417,11 +1396,53 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) */ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { + unsigned long flags; + unsigned long mask; + int need_report = 0; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rnp. */ + /* Adjust any no-longer-needed kthreads. */ rcu_stop_cpu_kthread(cpu); rcu_node_kthread_setaffinity(rnp, -1); + + /* Remove the dying CPU from the bitmasks in the rcu_node hierarchy. */ + + /* Exclude any attempts to start a new grace period. */ + raw_spin_lock_irqsave(&rsp->onofflock, flags); + + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ + mask = rdp->grpmask; /* rnp->grplo is constant. */ + do { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit != 0) { + if (rnp != rdp->mynode) + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + break; + } + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + mask = rnp->grpmask; + rnp = rnp->parent; + } while (rnp != NULL); + + /* + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * held leads to deadlock. + */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp, true); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index e2ac8ee415bb..cdd1be0a4072 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -289,7 +289,6 @@ struct rcu_data { /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ unsigned long offline_fqs; /* Kicked due to being offline. */ - unsigned long resched_ipi; /* Sent a resched IPI. */ /* 5) __rcu_pending() statistics. */ unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5e25ee327ccb..07f880445d8d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -610,7 +610,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, * absolutely necessary, but this is a good performance/complexity * tradeoff. */ - if (rcu_preempt_blocked_readers_cgp(rnp)) + if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) retval |= RCU_OFL_TASKS_NORM_GP; if (rcu_preempted_readers_exp(rnp)) retval |= RCU_OFL_TASKS_EXP_GP; diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index db0987c1e1bd..ed459edeff43 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -72,7 +72,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); - seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, " of=%lu", rdp->offline_fqs); seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != @@ -144,7 +144,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); - seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, ",%lu", rdp->offline_fqs); seq_printf(m, ",%ld,%ld,\"%c%c%c%c\"", rdp->qlen_lazy, rdp->qlen, ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]], @@ -168,7 +168,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) { seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); - seq_puts(m, "\"of\",\"ri\",\"qll\",\"ql\",\"qs\""); + seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); #ifdef CONFIG_RCU_BOOST seq_puts(m, "\"kt\",\"ktl\""); #endif /* #ifdef CONFIG_RCU_BOOST */ -- cgit v1.2.3 From 236fefafe5d3d34b78ed2ccf5510909716112326 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jan 2012 14:00:41 -0800 Subject: rcu: Call out dangers of expedited RCU primitives The expedited RCU primitives can be quite useful, but they have some high costs as well. This commit updates and creates docbook comments calling out the costs, and updates the RCU documentation as well. Signed-off-by: Paul E. McKenney --- Documentation/RCU/checklist.txt | 14 ++++++++++++++ include/linux/rcutree.h | 16 ++++++++++++++++ kernel/rcutree.c | 22 ++++++++++++++-------- kernel/rcutree_plugin.h | 20 ++++++++++++++++---- kernel/srcu.c | 27 +++++++++++++++++---------- 5 files changed, 77 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt index bff2d8be1e18..5c8d74968090 100644 --- a/Documentation/RCU/checklist.txt +++ b/Documentation/RCU/checklist.txt @@ -180,6 +180,20 @@ over a rather long period of time, but improvements are always welcome! operations that would not normally be undertaken while a real-time workload is running. + In particular, if you find yourself invoking one of the expedited + primitives repeatedly in a loop, please do everyone a favor: + Restructure your code so that it batches the updates, allowing + a single non-expedited primitive to cover the entire batch. + This will very likely be faster than the loop containing the + expedited primitive, and will be much much easier on the rest + of the system, especially to real-time workloads running on + the rest of the system. + + In addition, it is illegal to call the expedited forms from + a CPU-hotplug notifier, or while holding a lock that is acquired + by a CPU-hotplug notifier. Failing to observe this restriction + will result in deadlock. + 7. If the updater uses call_rcu() or synchronize_rcu(), then the corresponding readers must use rcu_read_lock() and rcu_read_unlock(). If the updater uses call_rcu_bh() or diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 73892483fd05..e8ee5dd0854c 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -63,6 +63,22 @@ extern void synchronize_rcu_expedited(void); void kfree_call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +/** + * synchronize_rcu_bh_expedited - Brute-force RCU-bh grace period + * + * Wait for an RCU-bh grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_rcu_bh_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_rcu_bh() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. + */ static inline void synchronize_rcu_bh_expedited(void) { synchronize_sched_expedited(); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 708469a06860..df0e3c1bb68e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1961,15 +1961,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data) return 0; } -/* - * Wait for an rcu-sched grace period to elapse, but use "big hammer" - * approach to force grace period to end quickly. This consumes - * significant time on all CPUs, and is thus not recommended for - * any sort of common-case code. +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. * - * Note that it is illegal to call this function while holding any - * lock that is acquired by a CPU-hotplug notifier. Failing to - * observe this restriction will result in deadlock. + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. * * This implementation can be thought of as an application of ticket * locking to RCU, with sync_sched_expedited_started and diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 07f880445d8d..f7ceadf4986e 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -835,10 +835,22 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } -/* - * Wait for an rcu-preempt grace period, but expedite it. The basic idea - * is to invoke synchronize_sched_expedited() to push all the tasks to - * the ->blkd_tasks lists and wait for this list to drain. +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blkd_tasks lists and wait for this list to drain. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. + * In fact, if you are using synchronize_rcu_expedited() in a loop, + * please restructure your code to batch your updates, and then Use a + * single synchronize_rcu() instead. + * + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. */ void synchronize_rcu_expedited(void) { diff --git a/kernel/srcu.c b/kernel/srcu.c index 3f99fa0e8ed3..ba35f3a4a1f4 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -286,19 +286,26 @@ void synchronize_srcu(struct srcu_struct *sp) EXPORT_SYMBOL_GPL(synchronize_srcu); /** - * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * synchronize_srcu_expedited - Brute-force SRCU grace period * @sp: srcu_struct with which to synchronize. * - * Flip the completed counter, and wait for the old count to drain to zero. - * As with classic RCU, the updater must use some separate means of - * synchronizing concurrent updates. Can block; must be called from - * process context. + * Wait for an SRCU grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_srcu_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_srcu() instead. * - * Note that it is illegal to call synchronize_srcu_expedited() - * from the corresponding SRCU read-side critical section; doing so - * will result in deadlock. However, it is perfectly legal to call - * synchronize_srcu_expedited() on one srcu_struct from some other - * srcu_struct's read-side critical section. + * Note that it is illegal to call this function while holding any lock + * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal + * to call this function from a CPU-hotplug notifier. Failing to observe + * these restriction will result in deadlock. It is also illegal to call + * synchronize_srcu_expedited() from the corresponding SRCU read-side + * critical section; doing so will result in deadlock. However, it is + * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct + * from some other srcu_struct's read-side critical section, as long as + * the resulting graph of srcu_structs is acyclic. */ void synchronize_srcu_expedited(struct srcu_struct *sp) { -- cgit v1.2.3 From 7129d383d9f46eb8276bee5fb46df63f09a70130 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 31 Jan 2012 16:46:34 -0800 Subject: rcu: Trace only after NULL-pointer check Fix a bonehead error introduced when adding event tracing to rcutorture. Move the traces to follow the NULL-pointer checks. Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 5cfa23be43bd..ed1c72bd9c09 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1000,12 +1000,12 @@ static void rcu_torture_timer(unsigned long unused) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); return; } + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); spin_lock(&rand_lock); @@ -1063,13 +1063,13 @@ rcu_torture_reader(void *arg) rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || srcu_read_lock_held(&srcu_ctl)); - do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); schedule_timeout_interruptible(HZ); continue; } + do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu); if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); cur_ops->read_delay(&rand); -- cgit v1.2.3 From 3c1b1ce00d2702d6be9b92233822e560f37ea780 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 2 Feb 2012 07:52:54 -0800 Subject: PTR_ERR should be called before its argument is cleared. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @@ expression e,e1; constant c; @@ *e = c ... when != e = e1 when != &e when != true IS_ERR(e) *PTR_ERR(e) // Signed-off-by: Julia Lawall Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcutorture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index ed1c72bd9c09..a89b381a8c6e 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1465,12 +1465,15 @@ rcu_torture_onoff(void *arg) static int __cpuinit rcu_torture_onoff_init(void) { + int ret; + if (onoff_interval <= 0) return 0; onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); if (IS_ERR(onoff_task)) { + ret = PTR_ERR(onoff_task); onoff_task = NULL; - return PTR_ERR(onoff_task); + return ret; } return 0; } -- cgit v1.2.3 From ce5df97be530e4746bf9a4ac14589a1cfdfd8efc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Feb 2012 08:26:06 -0800 Subject: rcu: Remove redundant check for rcu_head misalignment There is now an unconditional check for rcu_head misalignment in __call_rcu(), so remove the old conditional one in debug_rcu_head_queue(). Reported-by: Josh Triplett Signed-off-by: Paul E. McKenney --- kernel/rcu.h | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu.h b/kernel/rcu.h index a074b0b43fc2..30876f4063b6 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -50,7 +50,6 @@ extern struct debug_obj_descr rcuhead_debug_descr; static inline void debug_rcu_head_queue(struct rcu_head *head) { - WARN_ON_ONCE((unsigned long)head & 0x3); debug_object_activate(head, &rcuhead_debug_descr); debug_object_active_state(head, &rcuhead_debug_descr, STATE_RCU_HEAD_READY, -- cgit v1.2.3 From 29e37d814188ac8d60f2120583704d3ef6d634b4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Nov 2011 16:55:56 -0800 Subject: rcu: Allow nesting of rcu_idle_enter() and rcu_idle_exit() Use of RCU in the idle loop is incorrect, quite a few instances of just that have made their way into mainline, primarily event tracing. The problem with RCU read-side critical sections on CPUs that RCU believes to be idle is that RCU is completely ignoring the CPU, along with any attempts and RCU read-side critical sections. The approaches of eliminating the offending uses and of pushing the definition of idle down beyond the offending uses have both proved impractical. The new approach is to encapsulate offending uses of RCU with rcu_idle_exit() and rcu_idle_enter(), but this requires nesting for code that is invoked both during idle and and during normal execution. Therefore, this commit modifies rcu_idle_enter() and rcu_idle_exit() to permit nesting. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Deepthi Dharwar --- kernel/rcu.h | 21 ++++++++++++++++++++- kernel/rcutiny.c | 16 ++++++++++++---- kernel/rcutree.c | 21 ++++++++++++++------- 3 files changed, 46 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu.h b/kernel/rcu.h index 30876f4063b6..8ba99cdc6515 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -33,8 +33,27 @@ * Process-level increment to ->dynticks_nesting field. This allows for * architectures that use half-interrupts and half-exceptions from * process context. + * + * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH + * that counts the number of process-based reasons why RCU cannot + * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE + * is the value used to increment or decrement this field. + * + * The rest of the bits could in principle be used to count interrupts, + * but this would mean that a negative-one value in the interrupt + * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. + * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK + * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. + * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon + * initial exit from idle. */ -#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1) +#define DYNTICK_TASK_NEST_WIDTH 7 +#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) +#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) +#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) +#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) +#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ + DYNTICK_TASK_FLAG) /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 4eb34fcc2a75..c8b0e1582c04 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -53,7 +53,7 @@ static void __call_rcu(struct rcu_head *head, #include "rcutiny_plugin.h" -static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING; +static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ static void rcu_idle_enter_common(long long oldval) @@ -88,7 +88,12 @@ void rcu_idle_enter(void) local_irq_save(flags); oldval = rcu_dynticks_nesting; - rcu_dynticks_nesting = 0; + WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); + if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) + rcu_dynticks_nesting = 0; + else + rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_idle_enter_common(oldval); local_irq_restore(flags); } @@ -140,8 +145,11 @@ void rcu_idle_exit(void) local_irq_save(flags); oldval = rcu_dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rcu_dynticks_nesting = DYNTICK_TASK_NESTING; + WARN_ON_ONCE(rcu_dynticks_nesting < 0); + if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) + rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_idle_exit_common(oldval); local_irq_restore(flags); } diff --git a/kernel/rcutree.c b/kernel/rcutree.c index df0e3c1bb68e..92b47760edf3 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -198,7 +198,7 @@ void rcu_note_context_switch(int cpu) EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_NESTING, + .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), }; @@ -394,7 +394,11 @@ void rcu_idle_enter(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting = 0; + WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); + if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rdtp->dynticks_nesting = 0; + else + rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } @@ -467,7 +471,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to + * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to * allow for the possibility of usermode upcalls messing up our count * of interrupt nesting level during the busy period that is just * now starting. @@ -481,8 +485,11 @@ void rcu_idle_exit(void) local_irq_save(flags); rdtp = &__get_cpu_var(rcu_dynticks); oldval = rdtp->dynticks_nesting; - WARN_ON_ONCE(oldval != 0); - rdtp->dynticks_nesting = DYNTICK_TASK_NESTING; + WARN_ON_ONCE(oldval < 0); + if (oldval & DYNTICK_TASK_NEST_MASK) + rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; + else + rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } @@ -2253,7 +2260,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_lazy = 0; rdp->qlen = 0; rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; @@ -2281,7 +2288,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING; + rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); rcu_prepare_for_idle_init(cpu); -- cgit v1.2.3 From 8a2ecf474d3ee8dd5d001490349e422cec52f39f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Feb 2012 15:42:04 -0800 Subject: rcu: Add RCU_NONIDLE() for idle-loop RCU read-side critical sections RCU, RCU-bh, and RCU-sched read-side critical sections are forbidden in the inner idle loop, that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU will happily ignore any such read-side critical sections. However, things like powertop need tracepoints in the inner idle loop. This commit therefore provides an RCU_NONIDLE() macro that can be used to wrap code in the idle loop that requires RCU read-side critical sections. Suggested-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Acked-by: Deepthi Dharwar --- include/linux/rcupdate.h | 27 +++++++++++++++++++++++++++ kernel/rcutiny.c | 2 ++ kernel/rcutree.c | 2 ++ 3 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6ee663c8745a..937217425c47 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -190,6 +190,33 @@ extern void rcu_idle_exit(void); extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); +/** + * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers + * @a: Code that RCU needs to pay attention to. + * + * RCU, RCU-bh, and RCU-sched read-side critical sections are forbidden + * in the inner idle loop, that is, between the rcu_idle_enter() and + * the rcu_idle_exit() -- RCU will happily ignore any such read-side + * critical sections. However, things like powertop need tracepoints + * in the inner idle loop. + * + * This macro provides the way out: RCU_NONIDLE(do_something_with_RCU()) + * will tell RCU that it needs to pay attending, invoke its argument + * (in this example, a call to the do_something_with_RCU() function), + * and then tell RCU to go back to ignoring this CPU. It is permissible + * to nest RCU_NONIDLE() wrappers, but the nesting level is currently + * quite limited. If deeper nesting is required, it will be necessary + * to adjust DYNTICK_TASK_NESTING_VALUE accordingly. + * + * This macro may be used from process-level code only. + */ +#define RCU_NONIDLE(a) \ + do { \ + rcu_idle_exit(); \ + do { a; } while (0); \ + rcu_idle_enter(); \ + } while (0) + /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index c8b0e1582c04..37a5444204d2 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -97,6 +97,7 @@ void rcu_idle_enter(void) rcu_idle_enter_common(oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); /* * Exit an interrupt handler towards idle. @@ -153,6 +154,7 @@ void rcu_idle_exit(void) rcu_idle_exit_common(oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); /* * Enter an interrupt handler, moving away from idle. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 92b47760edf3..eacc10b03531 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -402,6 +402,7 @@ void rcu_idle_enter(void) rcu_idle_enter_common(rdtp, oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_enter); /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -493,6 +494,7 @@ void rcu_idle_exit(void) rcu_idle_exit_common(rdtp, oldval); local_irq_restore(flags); } +EXPORT_SYMBOL_GPL(rcu_idle_exit); /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle -- cgit v1.2.3 From c3ce910b1456a45fa88959af3735bd6b285e54af Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 14 Feb 2012 10:12:54 -0800 Subject: rcu: Eliminate softirq-mediated RCU_FAST_NO_HZ idle-entry loop If a softirq is pending, the current CPU has RCU callbacks pending, and RCU does not immediately need anything from this CPU, then the current code resets the RCU_FAST_NO_HZ state machine. This means that upon exit from the subsequent softirq handler, RCU_FAST_NO_HZ will try really hard to force RCU into dyntick-idle mode. And if the same conditions hold after a few tries (determined by RCU_IDLE_OPT_FLUSHES), the same situation can repeat, possibly endlessly. This scenario is not particularly good for battery lifetime. This commit therefore suppresses the early exit from the RCU_FAST_NO_HZ state machine in the case where there is a softirq pending. This change forces the state machine to retain its memory, and to enter holdoff if this condition persists. Reported-by: "Abou Gazala, Neven M" Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index f7ceadf4986e..392a65136a72 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2133,7 +2133,8 @@ static void rcu_prepare_for_idle(int cpu) /* First time through, initialize the counter. */ per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES; } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES && - !rcu_pending(cpu)) { + !rcu_pending(cpu) && + !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; -- cgit v1.2.3 From 696a02cc16b182dd78b1f395ae336f449cc90f11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 16 Feb 2012 21:59:33 -0800 Subject: rcu: Hold off RCU_FAST_NO_HZ after timer posted This commit handles workloads that transition quickly between idle and non-idle, and where the CPU's callbacks cannot be invoked, but where RCU does not have anything immediate for the CPU to do. Without this patch, the RCU_FAST_NO_HZ code can be invoked repeatedly on each entry to idle. The commit sets the per-CPU rcu_dyntick_holdoff variable to hold off further attempts for a tick. Reported-by: "Abou Gazala, Neven M" Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- kernel/rcutree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 392a65136a72..c023464816be 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2138,7 +2138,7 @@ static void rcu_prepare_for_idle(int cpu) /* Can we go dyntick-idle despite still having callbacks? */ trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; if (rcu_cpu_has_nonlazy_callbacks(cpu)) hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu), rcu_idle_gp_wait, HRTIMER_MODE_REL); -- cgit v1.2.3 From 9a4b430451bb6d8d6b7dcdfbee0e1330b7c475a6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Feb 2012 03:37:26 +0100 Subject: cgroup: Remove wrong comment on cgroup_enable_task_cg_list() Remove the stale comment about RCU protection. Many callers (all of them?) of cgroup_enable_task_cg_list() don't seem to be in an RCU read side critical section. Besides, RCU is not helpful to protect against while_each_thread(). Signed-off-by: Frederic Weisbecker Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Mandeep Singh Baines Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul E. McKenney --- kernel/cgroup.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 865d89a580c7..6e4eb4312571 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2701,9 +2701,6 @@ static void cgroup_advance_iter(struct cgroup *cgrp, * using their cgroups capability, we don't maintain the lists running * through each css_set to its tasks until we see the list actually * used - in other words after the first call to cgroup_iter_start(). - * - * The tasklist_lock is not held here, as do_each_thread() and - * while_each_thread() are protected by RCU. */ static void cgroup_enable_task_cg_lists(void) { -- cgit v1.2.3 From 3ce3230a0cff484e5130153f244d4fb8a56b3a8b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Feb 2012 03:37:27 +0100 Subject: cgroup: Walk task list under tasklist_lock in cgroup_enable_task_cg_list Walking through the tasklist in cgroup_enable_task_cg_list() inside an RCU read side critical section is not enough because: - RCU is not (yet) safe against while_each_thread() - If we use only RCU, a forking task that has passed cgroup_post_fork() without seeing use_task_css_set_links == 1 is not guaranteed to have its child immediately visible in the tasklist if we walk through it remotely with RCU. In this case it will be missing in its css_set's task list. Thus we need to traverse the list (unfortunately) under the tasklist_lock. It makes us safe against while_each_thread() and also make sure we see all forked task that have been added to the tasklist. As a secondary effect, reading and writing use_task_css_set_links are now well ordered against tasklist traversing and modification. The new layout is: CPU 0 CPU 1 use_task_css_set_links = 1 write_lock(tasklist_lock) read_lock(tasklist_lock) add task to tasklist do_each_thread() { write_unlock(tasklist_lock) add thread to css set links if (use_task_css_set_links) } while_each_thread() add thread to css set links read_unlock(tasklist_lock) If CPU 0 traverse the list after the task has been added to the tasklist then it is correctly added to the css set links. OTOH if CPU 0 traverse the tasklist before the new task had the opportunity to be added to the tasklist because it was too early in the fork process, then CPU 1 catches up and add the task to the css set links after it added the task to the tasklist. The right value of use_task_css_set_links is guaranteed to be visible from CPU 1 due to the LOCK/UNLOCK implicit barrier properties: the read_unlock on CPU 0 makes the write on use_task_css_set_links happening and the write_lock on CPU 1 make the read of use_task_css_set_links that comes afterward to return the correct value. Signed-off-by: Frederic Weisbecker Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: Mandeep Singh Baines Cc: Oleg Nesterov Cc: Andrew Morton Cc: Paul E. McKenney --- kernel/cgroup.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6e4eb4312571..c6877fe9a831 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2707,6 +2707,14 @@ static void cgroup_enable_task_cg_lists(void) struct task_struct *p, *g; write_lock(&css_set_lock); use_task_css_set_links = 1; + /* + * We need tasklist_lock because RCU is not safe against + * while_each_thread(). Besides, a forking task that has passed + * cgroup_post_fork() without seeing use_task_css_set_links = 1 + * is not guaranteed to have its child immediately visible in the + * tasklist if we walk through it with RCU. + */ + read_lock(&tasklist_lock); do_each_thread(g, p) { task_lock(p); /* @@ -2718,6 +2726,7 @@ static void cgroup_enable_task_cg_lists(void) list_add(&p->cg_list, &p->cgroups->tasks); task_unlock(p); } while_each_thread(g, p); + read_unlock(&tasklist_lock); write_unlock(&css_set_lock); } @@ -4522,6 +4531,17 @@ void cgroup_fork_callbacks(struct task_struct *child) */ void cgroup_post_fork(struct task_struct *child) { + /* + * use_task_css_set_links is set to 1 before we walk the tasklist + * under the tasklist_lock and we read it here after we added the child + * to the tasklist under the tasklist_lock as well. If the child wasn't + * yet in the tasklist when we walked through it from + * cgroup_enable_task_cg_lists(), then use_task_css_set_links value + * should be visible now due to the paired locking and barriers implied + * by LOCK/UNLOCK: it is written before the tasklist_lock unlock + * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock + * lock on fork. + */ if (use_task_css_set_links) { write_lock(&css_set_lock); if (list_empty(&child->cg_list)) { -- cgit v1.2.3 From 1cc85961e214773cb7d7f2ccbe3bc644dd466df0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 17 Feb 2012 13:20:31 -0800 Subject: rcu: Stop spurious warnings from synchronize_sched_expedited synchronize_sched_expedited() is spamming CONFIG_DEBUG_PREEMPT=y users with an unintended warning from the cpu_is_offline() check: use raw_smp_processor_id() instead of smp_processor_id() there. Because the warning is under a get_online_cpus(), it is not possible for any CPUs to go offline, though it is quite possible that the task might migrate between the raw_smp_processor_id() and the check of cpu_is_offline(). This is not a problem because the task cannot migrate from an offline CPU to an online one or vice versa. The point of the check is to verify that synchronize_sched_expedited() is not called from an offline CPU, for example, from a CPU_DYING notifier, or, more important, from an outgoing CPU making its way from its CPU_DYING notifiers to the idle loop. Signed-off-by: Hugh Dickins Signed-off-by: Paul E. McKenney --- kernel/rcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index eacc10b03531..1050d6d3922c 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2014,7 +2014,7 @@ void synchronize_sched_expedited(void) /* Note that atomic_inc_return() implies full memory barrier. */ firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); get_online_cpus(); - WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); /* * Each pass through the following loop attempts to force a -- cgit v1.2.3 From fadf0464b83f91ba021a358c0238a0810c0d2a0b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 21 Feb 2012 15:02:53 -0500 Subject: jump label: Add a WARN() if jump label key count goes negative The count on a jump label key should never go negative. Add a WARN() to check for this condition. Signed-off-by: Jason Baron Cc: Gleb Natapov Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/3c68556121be4d1920417a3fe367da1ec38246b4.1329851692.git.jbaron@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 01d3b70fc98a..ed9654fd7d27 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -76,8 +76,11 @@ EXPORT_SYMBOL_GPL(jump_label_inc); static void __jump_label_dec(struct jump_label_key *key, unsigned long rate_limit, struct delayed_work *work) { - if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { + WARN(atomic_read(&key->enabled) < 0, + "jump label: negative count!\n"); return; + } if (rate_limit) { atomic_inc(&key->enabled); -- cgit v1.2.3 From a746e3cc984b0aa5b620dd07c1a433283b1835cf Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 21 Feb 2012 15:02:57 -0500 Subject: jump label: Fix compiler warning While cross-compiling on sparc64, I found: kernel/jump_label.c: In function 'jump_label_update': kernel/jump_label.c:447:40: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Fix by casting to 'unsigned long'. Signed-off-by: Jason Baron Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: a.p.zijlstra@chello.nl Link: http://lkml.kernel.org/r/08026cbc6df80619cae833ef1ebbbc43efab69ab.1329851692.git.jbaron@redhat.com Signed-off-by: Ingo Molnar --- kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ed9654fd7d27..543782e7cdd2 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -424,7 +424,7 @@ static void jump_label_update(struct jump_label_key *key, int enable) struct jump_entry *entry = key->entries, *stop = __stop___jump_table; #ifdef CONFIG_MODULES - struct module *mod = __module_address((jump_label_t)key); + struct module *mod = __module_address((unsigned long)key); __jump_label_mod_update(key, enable); -- cgit v1.2.3 From 8c79a045fd590a26e81e75f5d8d4ec5c7d23e565 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Jan 2012 14:51:37 +0100 Subject: sched/events: Revert trace_sched_stat_sleeptime() Commit 1ac9bc69 ("sched/tracing: Add a new tracepoint for sleeptime") added a new sched:sched_stat_sleeptime tracepoint. It's broken: the first sample we get on a task might be bad because of a stale sleep_start value that wasn't reset at the last task switch because the tracepoint was not active. It also breaks the existing schedstat samples due to the side effects of: - se->statistics.sleep_start = 0; ... - se->statistics.block_start = 0; Nor do I see means to fix it without adding overhead to the scheduler fast path, which I'm not willing to for the sake of redundant instrumentation. Most importantly, sleep time information can already be constructed by tracing context switches and wakeups, and taking the timestamp difference between the schedule-out, the wakeup and the schedule-in. Signed-off-by: Peter Zijlstra Cc: Andrew Vagin Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-pc4c9qhl8q6vg3bs4j6k0rbd@git.kernel.org Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 50 -------------------------------------------- kernel/sched/core.c | 1 - kernel/sched/fair.c | 2 ++ 3 files changed, 2 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 6ba596b07a72..e33ed1bfa113 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -370,56 +370,6 @@ TRACE_EVENT(sched_stat_runtime, (unsigned long long)__entry->vruntime) ); -#ifdef CREATE_TRACE_POINTS -static inline u64 trace_get_sleeptime(struct task_struct *tsk) -{ -#ifdef CONFIG_SCHEDSTATS - u64 block, sleep; - - block = tsk->se.statistics.block_start; - sleep = tsk->se.statistics.sleep_start; - tsk->se.statistics.block_start = 0; - tsk->se.statistics.sleep_start = 0; - - return block ? block : sleep ? sleep : 0; -#else - return 0; -#endif -} -#endif - -/* - * Tracepoint for accounting sleeptime (time the task is sleeping - * or waiting for I/O). - */ -TRACE_EVENT(sched_stat_sleeptime, - - TP_PROTO(struct task_struct *tsk, u64 now), - - TP_ARGS(tsk, now), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( u64, sleeptime ) - ), - - TP_fast_assign( - memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); - __entry->pid = tsk->pid; - __entry->sleeptime = trace_get_sleeptime(tsk); - __entry->sleeptime = __entry->sleeptime ? - now - __entry->sleeptime : 0; - ) - TP_perf_assign( - __perf_count(__entry->sleeptime); - ), - - TP_printk("comm=%s pid=%d sleeptime=%Lu [ns]", - __entry->comm, __entry->pid, - (unsigned long long)__entry->sleeptime) -); - /* * Tracepoint for showing priority inheritance modifying a tasks * priority. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..b342f57879e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1932,7 +1932,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); - trace_sched_stat_sleeptime(current, rq->clock); fire_sched_in_preempt_notifiers(current); if (mm) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..aca16b843b7e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1003,6 +1003,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.sleep_max)) se->statistics.sleep_max = delta; + se->statistics.sleep_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { @@ -1019,6 +1020,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (unlikely(delta > se->statistics.block_max)) se->statistics.block_max = delta; + se->statistics.block_start = 0; se->statistics.sum_sleep_runtime += delta; if (tsk) { -- cgit v1.2.3 From 62f6536a630affe3176deb48554d27ee58b65077 Mon Sep 17 00:00:00 2001 From: "Nikunj A. Dadhania" Date: Fri, 17 Feb 2012 08:34:30 +0530 Subject: sched: Remove rcu_read_lock/unlock() from select_idle_sibling() select_idle_sibling() is called from select_task_rq_fair(), which already has the RCU read lock held. Signed-off-by: Nikunj A. Dadhania Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20120217030409.11748.12491.stgit@abhimanyu Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4ab60a24ea49..6ce9992926d0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2670,8 +2670,6 @@ static int select_idle_sibling(struct task_struct *p, int target) /* * Otherwise, iterate the domains and find an elegible idle cpu. */ - rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { sg = sd->groups; @@ -2693,8 +2691,6 @@ next: } while (sg != sd->groups); } done: - rcu_read_unlock(); - return target; } -- cgit v1.2.3 From de5bdff7a72acc281219be2b8edeeca1fd81c542 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 16 Feb 2012 14:52:21 +0900 Subject: sched: Make initial SCHED_RR timeslace DEF_TIMESLICE Current the initial SCHED_RR timeslice of init_task is HZ, which means 1s, and is not same as the default SCHED_RR timeslice DEF_TIMESLICE. Change that initial timeslice to the DEF_TIMESLICE. Signed-off-by: Hiroshi Shimamoto [ s/DEF_TIMESLICE/RR_TIMESLICE/g ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F3C9995.3010800@ct.jp.nec.com Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 +- include/linux/sched.h | 6 ++++++ kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 4 ---- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9c66b1ada9d7..f994d51f70f2 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,7 +149,7 @@ extern struct cred init_cred; }, \ .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ - .time_slice = HZ, \ + .time_slice = RR_TIMESLICE, \ .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5dba2ad52431..eb5de466f099 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1234,6 +1234,12 @@ struct sched_rt_entity { #endif }; +/* + * default timeslice is 100 msecs (used only for SCHED_RR tasks). + * Timeslices get refilled after they expire. + */ +#define RR_TIMESLICE (100 * HZ / 1000) + struct rcu_node; enum perf_event_task_context { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f42ae7fb5ec5..f70206c2c802 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1972,7 +1972,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) if (--p->rt.time_slice) return; - p->rt.time_slice = DEF_TIMESLICE; + p->rt.time_slice = RR_TIMESLICE; /* * Requeue to the end of queue if we are not the only element @@ -2000,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) * Time slice is 0 for SCHED_FIFO tasks */ if (task->policy == SCHED_RR) - return DEF_TIMESLICE; + return RR_TIMESLICE; else return 0; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8a2c768d2f98..c0660a1a0cd1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -36,11 +36,7 @@ extern __read_mostly int scheduler_running; /* * These are the 'tuning knobs' of the scheduler: - * - * default timeslice is 100 msecs (used only for SCHED_RR tasks). - * Timeslices get refilled after they expire. */ -#define DEF_TIMESLICE (100 * HZ / 1000) /* * single value that denotes runtime == period, ie unlimited time. -- cgit v1.2.3 From 499e547057f5bba5cd6f87ebe59b05d0c59da905 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 22 Feb 2012 15:50:28 -0500 Subject: tracing/ring-buffer: Only have tracing_on disable tracing buffers As the ring-buffer code is being used by other facilities in the kernel, having tracing_on file disable *all* buffers is not a desired affect. It should only disable the ftrace buffers that are being used. Move the code into the trace.c file and use the buffer disabling for tracing_on() and tracing_off(). This way only the ftrace buffers will be affected by them and other kernel utilities will not be confused to why their output suddenly stopped. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 3 + kernel/trace/ring_buffer.c | 157 +++++++++++++++++--------------------------- kernel/trace/trace.c | 107 ++++++++++++++++++++++++++++++ kernel/trace/trace.h | 1 + 4 files changed, 171 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 67be0376d8e3..7be2e88f23fd 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -151,6 +151,9 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_disable(struct ring_buffer *buffer); void ring_buffer_record_enable(struct ring_buffer *buffer); +void ring_buffer_record_off(struct ring_buffer *buffer); +void ring_buffer_record_on(struct ring_buffer *buffer); +int ring_buffer_record_is_on(struct ring_buffer *buffer); void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu); void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f5b7b5c1195b..cf8d11e91efd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -154,33 +154,10 @@ enum { static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; -#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) - -/** - * tracing_on - enable all tracing buffers - * - * This function enables all tracing buffers that may have been - * disabled with tracing_off. - */ -void tracing_on(void) -{ - set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_on); +/* Used for individual buffers (after the counter) */ +#define RB_BUFFER_OFF (1 << 20) -/** - * tracing_off - turn off all tracing buffers - * - * This function stops all tracing buffers from recording data. - * It does not disable any overhead the tracers themselves may - * be causing. This function simply causes all recording to - * the ring buffers to fail. - */ -void tracing_off(void) -{ - clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); -} -EXPORT_SYMBOL_GPL(tracing_off); +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) /** * tracing_off_permanent - permanently disable ring buffers @@ -193,15 +170,6 @@ void tracing_off_permanent(void) set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); } -/** - * tracing_is_on - show state of ring buffers enabled - */ -int tracing_is_on(void) -{ - return ring_buffer_flags == RB_BUFFERS_ON; -} -EXPORT_SYMBOL_GPL(tracing_is_on); - #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) @@ -2618,6 +2586,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable); +/** + * ring_buffer_record_off - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * This is different than ring_buffer_record_disable() as + * it works like an on/off switch, where as the disable() verison + * must be paired with a enable(). + */ +void ring_buffer_record_off(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd | RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_off); + +/** + * ring_buffer_record_on - restart writes into the buffer + * @buffer: The ring buffer to start writes to. + * + * This enables all writes to the buffer that was disabled by + * ring_buffer_record_off(). + * + * This is different than ring_buffer_record_enable() as + * it works like an on/off switch, where as the enable() verison + * must be paired with a disable(). + */ +void ring_buffer_record_on(struct ring_buffer *buffer) +{ + unsigned int rd; + unsigned int new_rd; + + do { + rd = atomic_read(&buffer->record_disabled); + new_rd = rd & ~RB_BUFFER_OFF; + } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_on); + +/** + * ring_buffer_record_is_on - return true if the ring buffer can write + * @buffer: The ring buffer to see if write is enabled + * + * Returns true if the ring buffer is in a state that it accepts writes. + */ +int ring_buffer_record_is_on(struct ring_buffer *buffer) +{ + return !atomic_read(&buffer->record_disabled); +} + /** * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer * @buffer: The ring buffer to stop writes to. @@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_read_page); -#ifdef CONFIG_TRACING -static ssize_t -rb_simple_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - char buf[64]; - int r; - - if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) - r = sprintf(buf, "permanently disabled\n"); - else - r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); - - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static ssize_t -rb_simple_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long *p = filp->private_data; - unsigned long val; - int ret; - - ret = kstrtoul_from_user(ubuf, cnt, 10, &val); - if (ret) - return ret; - - if (val) - set_bit(RB_BUFFERS_ON_BIT, p); - else - clear_bit(RB_BUFFERS_ON_BIT, p); - - (*ppos)++; - - return cnt; -} - -static const struct file_operations rb_simple_fops = { - .open = tracing_open_generic, - .read = rb_simple_read, - .write = rb_simple_write, - .llseek = default_llseek, -}; - - -static __init int rb_init_debugfs(void) -{ - struct dentry *d_tracer; - - d_tracer = tracing_init_dentry(); - - trace_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - - return 0; -} - -fs_initcall(rb_init_debugfs); -#endif - #ifdef CONFIG_HOTPLUG_CPU static int rb_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 10d5503f0d04..f3c13d63d064 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -351,6 +351,59 @@ static void wakeup_work_handler(struct work_struct *work) static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler); +/** + * tracing_on - enable tracing buffers + * + * This function enables tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) +{ + if (global_trace.buffer) + ring_buffer_record_on(global_trace.buffer); + /* + * This flag is only looked at when buffers haven't been + * allocated yet. We don't really care about the race + * between setting this flag and actually turning + * on the buffer. + */ + global_trace.buffer_disabled = 0; +} +EXPORT_SYMBOL_GPL(tracing_on); + +/** + * tracing_off - turn off tracing buffers + * + * This function stops the tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + if (global_trace.buffer) + ring_buffer_record_on(global_trace.buffer); + /* + * This flag is only looked at when buffers haven't been + * allocated yet. We don't really care about the race + * between setting this flag and actually turning + * on the buffer. + */ + global_trace.buffer_disabled = 1; +} +EXPORT_SYMBOL_GPL(tracing_off); + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + if (global_trace.buffer) + return ring_buffer_record_is_on(global_trace.buffer); + return !global_trace.buffer_disabled; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + /** * trace_wake_up - wake up tasks waiting for trace input * @@ -4567,6 +4620,55 @@ static __init void create_trace_options_dir(void) create_trace_option_core_file(trace_options[i], i); } +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ring_buffer *buffer = filp->private_data; + char buf[64]; + int r; + + if (buffer) + r = ring_buffer_record_is_on(buffer); + else + r = 0; + + r = sprintf(buf, "%d\n", r); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct ring_buffer *buffer = filp->private_data; + unsigned long val; + int ret; + + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); + if (ret) + return ret; + + if (buffer) { + if (val) + ring_buffer_record_on(buffer); + else + ring_buffer_record_off(buffer); + } + + (*ppos)++; + + return cnt; +} + +static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, + .llseek = default_llseek, +}; + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; @@ -4626,6 +4728,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("trace_clock", 0644, d_tracer, NULL, &trace_clock_fops); + trace_create_file("tracing_on", 0644, d_tracer, + global_trace.buffer, &rb_simple_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); @@ -4863,6 +4968,8 @@ __init static int tracer_alloc_buffers(void) goto out_free_cpumask; } global_trace.entries = ring_buffer_size(global_trace.buffer); + if (global_trace.buffer_disabled) + tracing_off(); #ifdef CONFIG_TRACER_MAX_TRACE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 54faec790bc1..ce887c0eca56 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -154,6 +154,7 @@ struct trace_array { struct ring_buffer *buffer; unsigned long entries; int cpu; + int buffer_disabled; cycle_t time_start; struct task_struct *waiter; struct trace_array_cpu *data[NR_CPUS]; -- cgit v1.2.3 From c5905afb0ee6550b42c49213da1c22d67316c194 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 24 Feb 2012 08:31:31 +0100 Subject: static keys: Introduce 'struct static_key', static_key_true()/false() and static_key_slow_[inc|dec]() So here's a boot tested patch on top of Jason's series that does all the cleanups I talked about and turns jump labels into a more intuitive to use facility. It should also address the various misconceptions and confusions that surround jump labels. Typical usage scenarios: #include struct static_key key = STATIC_KEY_INIT_TRUE; if (static_key_false(&key)) do unlikely code else do likely code Or: if (static_key_true(&key)) do likely code else do unlikely code The static key is modified via: static_key_slow_inc(&key); ... static_key_slow_dec(&key); The 'slow' prefix makes it abundantly clear that this is an expensive operation. I've updated all in-kernel code to use this everywhere. Note that I (intentionally) have not pushed through the rename blindly through to the lowest levels: the actual jump-label patching arch facility should be named like that, so we want to decouple jump labels from the static-key facility a bit. On non-jump-label enabled architectures static keys default to likely()/unlikely() branches. Signed-off-by: Ingo Molnar Acked-by: Jason Baron Acked-by: Steven Rostedt Cc: a.p.zijlstra@chello.nl Cc: mathieu.desnoyers@efficios.com Cc: davem@davemloft.net Cc: ddaney.cavm@gmail.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20120222085809.GA26397@elte.hu Signed-off-by: Ingo Molnar --- arch/Kconfig | 29 ++++--- arch/ia64/include/asm/paravirt.h | 6 +- arch/ia64/kernel/paravirt.c | 4 +- arch/mips/include/asm/jump_label.h | 2 +- arch/powerpc/include/asm/jump_label.h | 2 +- arch/s390/include/asm/jump_label.h | 2 +- arch/sparc/include/asm/jump_label.h | 2 +- arch/x86/include/asm/jump_label.h | 6 +- arch/x86/include/asm/paravirt.h | 6 +- arch/x86/kernel/kvm.c | 4 +- arch/x86/kernel/paravirt.c | 4 +- arch/x86/kvm/mmu_audit.c | 8 +- include/linux/jump_label.h | 139 ++++++++++++++++++++++++---------- include/linux/netdevice.h | 4 +- include/linux/netfilter.h | 6 +- include/linux/perf_event.h | 12 +-- include/linux/static_key.h | 1 + include/linux/tracepoint.h | 8 +- include/net/sock.h | 6 +- kernel/events/core.c | 16 ++-- kernel/jump_label.c | 128 ++++++++++++++++++------------- kernel/sched/core.c | 18 ++--- kernel/sched/fair.c | 8 +- kernel/sched/sched.h | 14 ++-- kernel/tracepoint.c | 20 ++--- net/core/dev.c | 24 +++--- net/core/net-sysfs.c | 4 +- net/core/sock.c | 4 +- net/core/sysctl_net_core.c | 4 +- net/ipv4/tcp_memcontrol.c | 6 +- net/netfilter/core.c | 6 +- 31 files changed, 298 insertions(+), 205 deletions(-) create mode 100644 include/linux/static_key.h (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 4f55c736be11..5b448a74d0f7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -47,18 +47,29 @@ config KPROBES If in doubt, say "N". config JUMP_LABEL - bool "Optimize trace point call sites" + bool "Optimize very unlikely/likely branches" depends on HAVE_ARCH_JUMP_LABEL help + This option enables a transparent branch optimization that + makes certain almost-always-true or almost-always-false branch + conditions even cheaper to execute within the kernel. + + Certain performance-sensitive kernel code, such as trace points, + scheduler functionality, networking code and KVM have such + branches and include support for this optimization technique. + If it is detected that the compiler has support for "asm goto", - the kernel will compile trace point locations with just a - nop instruction. When trace points are enabled, the nop will - be converted to a jump to the trace function. This technique - lowers overhead and stress on the branch prediction of the - processor. - - On i386, options added to the compiler flags may increase - the size of the kernel slightly. + the kernel will compile such branches with just a nop + instruction. When the condition flag is toggled to true, the + nop will be converted to a jump instruction to execute the + conditional block of instructions. + + This technique lowers overhead and stress on the branch prediction + of the processor and generally makes the kernel faster. The update + of the condition is slower, but those are always very rare. + + ( On 32-bit x86, the necessary options added to the compiler + flags may increase the size of the kernel slightly. ) config OPTPROBES def_bool y diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h index 32551d304cd7..b149b88ea795 100644 --- a/arch/ia64/include/asm/paravirt.h +++ b/arch/ia64/include/asm/paravirt.h @@ -281,9 +281,9 @@ paravirt_init_missing_ticks_accounting(int cpu) pv_time_ops.init_missing_ticks_accounting(cpu); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline int paravirt_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c index 100868216c55..1b22f6de2932 100644 --- a/arch/ia64/kernel/paravirt.c +++ b/arch/ia64/kernel/paravirt.c @@ -634,8 +634,8 @@ struct pv_irq_ops pv_irq_ops = { * pv_time_ops * time operations */ -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static int ia64_native_do_steal_accounting(unsigned long *new_itm) diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h index 1881b316ca45..4d6d77ed9b9d 100644 --- a/arch/mips/include/asm/jump_label.h +++ b/arch/mips/include/asm/jump_label.h @@ -20,7 +20,7 @@ #define WORD_INSN ".word" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\tnop\n\t" "nop\n\t" diff --git a/arch/powerpc/include/asm/jump_label.h b/arch/powerpc/include/asm/jump_label.h index 938986e412f1..ae098c438f00 100644 --- a/arch/powerpc/include/asm/jump_label.h +++ b/arch/powerpc/include/asm/jump_label.h @@ -17,7 +17,7 @@ #define JUMP_ENTRY_TYPE stringify_in_c(FTR_ENTRY_LONG) #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h index 95a6cf2b5b67..6c32190dc73e 100644 --- a/arch/s390/include/asm/jump_label.h +++ b/arch/s390/include/asm/jump_label.h @@ -13,7 +13,7 @@ #define ASM_ALIGN ".balign 4" #endif -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("0: brcl 0,0\n" ".pushsection __jump_table, \"aw\"\n" diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h index fc73a82366f8..5080d16a832f 100644 --- a/arch/sparc/include/asm/jump_label.h +++ b/arch/sparc/include/asm/jump_label.h @@ -7,7 +7,7 @@ #define JUMP_LABEL_NOP_SIZE 4 -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:\n\t" "nop\n\t" diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index a32b18ce6ead..3a16c1483b45 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -9,12 +9,12 @@ #define JUMP_LABEL_NOP_SIZE 5 -#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" +#define STATIC_KEY_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" -static __always_inline bool arch_static_branch(struct jump_label_key *key) +static __always_inline bool arch_static_branch(struct static_key *key) { asm goto("1:" - JUMP_LABEL_INITIAL_NOP + STATIC_KEY_INITIAL_NOP ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" _ASM_PTR "1b, %l[l_yes], %c0 \n\t" diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index a7d2db9a74fb..c0180fd372d2 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -230,9 +230,9 @@ static inline unsigned long long paravirt_sched_clock(void) return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } -struct jump_label_key; -extern struct jump_label_key paravirt_steal_enabled; -extern struct jump_label_key paravirt_steal_rq_enabled; +struct static_key; +extern struct static_key paravirt_steal_enabled; +extern struct static_key paravirt_steal_rq_enabled; static inline u64 paravirt_steal_clock(int cpu) { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f0c6fd6f176b..694d801bf606 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -438,9 +438,9 @@ void __init kvm_guest_init(void) static __init int activate_jump_labels(void) { if (has_steal_clock) { - jump_label_inc(¶virt_steal_enabled); + static_key_slow_inc(¶virt_steal_enabled); if (steal_acc) - jump_label_inc(¶virt_steal_rq_enabled); + static_key_slow_inc(¶virt_steal_rq_enabled); } return 0; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index d90272e6bc40..ada2f99388dd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -202,8 +202,8 @@ static void native_flush_tlb_single(unsigned long addr) __native_flush_tlb_single(addr); } -struct jump_label_key paravirt_steal_enabled; -struct jump_label_key paravirt_steal_rq_enabled; +struct static_key paravirt_steal_enabled; +struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index fe15dcc07a6b..ea7b4fd34676 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -234,7 +234,7 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu) } static bool mmu_audit; -static struct jump_label_key mmu_audit_key; +static struct static_key mmu_audit_key; static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { @@ -250,7 +250,7 @@ static void __kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) static inline void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { - if (static_branch((&mmu_audit_key))) + if (static_key_false((&mmu_audit_key))) __kvm_mmu_audit(vcpu, point); } @@ -259,7 +259,7 @@ static void mmu_audit_enable(void) if (mmu_audit) return; - jump_label_inc(&mmu_audit_key); + static_key_slow_inc(&mmu_audit_key); mmu_audit = true; } @@ -268,7 +268,7 @@ static void mmu_audit_disable(void) if (!mmu_audit) return; - jump_label_dec(&mmu_audit_key); + static_key_slow_dec(&mmu_audit_key); mmu_audit = false; } diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index f7c69580fea7..2172da2d9bb4 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -9,15 +9,15 @@ * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support the result - * of a "if (static_branch(&key))" statement is a unconditional branch (which + * of a "if (static_key_false(&key))" statement is a unconditional branch (which * defaults to false - and the true block is placed out of line). * - * However at runtime we can change the 'static' branch target using - * jump_label_{inc,dec}(). These function as a 'reference' count on the key + * However at runtime we can change the branch target using + * static_key_slow_{inc,dec}(). These function as a 'reference' count on the key * object and for as long as there are references all branches referring to * that particular key will point to the (out of line) true block. * - * Since this relies on modifying code the jump_label_{inc,dec}() functions + * Since this relies on modifying code the static_key_slow_{inc,dec}() functions * must be considered absolute slow paths (machine wide synchronization etc.). * OTOH, since the affected branches are unconditional their runtime overhead * will be absolutely minimal, esp. in the default (off) case where the total @@ -26,12 +26,26 @@ * * When the control is directly exposed to userspace it is prudent to delay the * decrement to avoid high frequency code modifications which can (and do) - * cause significant performance degradation. Struct jump_label_key_deferred and - * jump_label_dec_deferred() provide for this. + * cause significant performance degradation. Struct static_key_deferred and + * static_key_slow_dec_deferred() provide for this. * * Lacking toolchain and or architecture support, it falls back to a simple * conditional branch. - */ + * + * struct static_key my_key = STATIC_KEY_INIT_TRUE; + * + * if (static_key_true(&my_key)) { + * } + * + * will result in the true case being in-line and starts the key with a single + * reference. Mixing static_key_true() and static_key_false() on the same key is not + * allowed. + * + * Not initializing the key (static data is initialized to 0s anyway) is the + * same as using STATIC_KEY_INIT_FALSE and static_key_false() is + * equivalent with static_branch(). + * +*/ #include #include @@ -39,16 +53,17 @@ #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) -struct jump_label_key { +struct static_key { atomic_t enabled; +/* Set lsb bit to 1 if branch is default true, 0 ot */ struct jump_entry *entries; #ifdef CONFIG_MODULES - struct jump_label_mod *next; + struct static_key_mod *next; #endif }; -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; unsigned long timeout; struct delayed_work work; }; @@ -66,13 +81,34 @@ struct module; #ifdef HAVE_JUMP_LABEL -#ifdef CONFIG_MODULES -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL, NULL} -#else -#define JUMP_LABEL_INIT {ATOMIC_INIT(0), NULL} -#endif +#define JUMP_LABEL_TRUE_BRANCH 1UL + +static +inline struct jump_entry *jump_label_get_entries(struct static_key *key) +{ + return (struct jump_entry *)((unsigned long)key->entries + & ~JUMP_LABEL_TRUE_BRANCH); +} + +static inline bool jump_label_get_branch_default(struct static_key *key) +{ + if ((unsigned long)key->entries & JUMP_LABEL_TRUE_BRANCH) + return true; + return false; +} + +static __always_inline bool static_key_false(struct static_key *key) +{ + return arch_static_branch(key); +} -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_true(struct static_key *key) +{ + return !static_key_false(key); +} + +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) { return arch_static_branch(key); } @@ -88,21 +124,24 @@ extern void arch_jump_label_transform(struct jump_entry *entry, extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); extern int jump_label_text_reserved(void *start, void *end); -extern void jump_label_inc(struct jump_label_key *key); -extern void jump_label_dec(struct jump_label_key *key); -extern void jump_label_dec_deferred(struct jump_label_key_deferred *key); -extern bool jump_label_enabled(struct jump_label_key *key); +extern void static_key_slow_inc(struct static_key *key); +extern void static_key_slow_dec(struct static_key *key); +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); -extern void jump_label_rate_limit(struct jump_label_key_deferred *key, - unsigned long rl); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0), .entries = (void *)0 }) #else /* !HAVE_JUMP_LABEL */ #include -#define JUMP_LABEL_INIT {ATOMIC_INIT(0)} - -struct jump_label_key { +struct static_key { atomic_t enabled; }; @@ -110,30 +149,45 @@ static __always_inline void jump_label_init(void) { } -struct jump_label_key_deferred { - struct jump_label_key key; +struct static_key_deferred { + struct static_key key; }; -static __always_inline bool static_branch(struct jump_label_key *key) +static __always_inline bool static_key_false(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static __always_inline bool static_key_true(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled))) + if (likely(atomic_read(&key->enabled)) > 0) return true; return false; } -static inline void jump_label_inc(struct jump_label_key *key) +/* Deprecated. Please use 'static_key_false() instead. */ +static __always_inline bool static_branch(struct static_key *key) +{ + if (unlikely(atomic_read(&key->enabled)) > 0) + return true; + return false; +} + +static inline void static_key_slow_inc(struct static_key *key) { atomic_inc(&key->enabled); } -static inline void jump_label_dec(struct jump_label_key *key) +static inline void static_key_slow_dec(struct static_key *key) { atomic_dec(&key->enabled); } -static inline void jump_label_dec_deferred(struct jump_label_key_deferred *key) +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) { - jump_label_dec(&key->key); + static_key_slow_dec(&key->key); } static inline int jump_label_text_reserved(void *start, void *end) @@ -144,9 +198,9 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool jump_label_enabled(struct jump_label_key *key) +static inline bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } static inline int jump_label_apply_nops(struct module *mod) @@ -154,13 +208,20 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void jump_label_rate_limit(struct jump_label_key_deferred *key, +static inline void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { } + +#define STATIC_KEY_INIT_TRUE ((struct static_key) \ + { .enabled = ATOMIC_INIT(1) }) +#define STATIC_KEY_INIT_FALSE ((struct static_key) \ + { .enabled = ATOMIC_INIT(0) }) + #endif /* HAVE_JUMP_LABEL */ -#define jump_label_key_enabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(1), }) -#define jump_label_key_disabled ((struct jump_label_key){ .enabled = ATOMIC_INIT(0), }) +#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE +#define jump_label_enabled static_key_enabled #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0eac07c95255..7dfaae7846ab 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -214,8 +214,8 @@ enum { #include #ifdef CONFIG_RPS -#include -extern struct jump_label_key rps_needed; +#include +extern struct static_key rps_needed; #endif struct neighbour; diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index b809265607d0..29734be334c1 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -163,13 +163,13 @@ extern struct ctl_path nf_net_ipv4_netfilter_sysctl_path[]; extern struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; #if defined(CONFIG_JUMP_LABEL) -#include -extern struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +#include +extern struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; static inline bool nf_hooks_active(u_int8_t pf, unsigned int hook) { if (__builtin_constant_p(pf) && __builtin_constant_p(hook)) - return static_branch(&nf_hooks_needed[pf][hook]); + return static_key_false(&nf_hooks_needed[pf][hook]); return !list_empty(&nf_hooks[pf][hook]); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 412b790f5da6..0d21e6f1cf53 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -514,7 +514,7 @@ struct perf_guest_info_callbacks { #include #include #include -#include +#include #include #include @@ -1038,7 +1038,7 @@ static inline int is_software_event(struct perf_event *event) return event->pmu->task_ctx_nr == perf_sw_context; } -extern struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +extern struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; extern void __perf_sw_event(u32, u64, struct pt_regs *, u64); @@ -1066,7 +1066,7 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) { struct pt_regs hot_regs; - if (static_branch(&perf_swevent_enabled[event_id])) { + if (static_key_false(&perf_swevent_enabled[event_id])) { if (!regs) { perf_fetch_caller_regs(&hot_regs); regs = &hot_regs; @@ -1075,12 +1075,12 @@ perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr) } } -extern struct jump_label_key_deferred perf_sched_events; +extern struct static_key_deferred perf_sched_events; static inline void perf_event_task_sched_in(struct task_struct *prev, struct task_struct *task) { - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_in(prev, task); } @@ -1089,7 +1089,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, { perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, NULL, 0); - if (static_branch(&perf_sched_events.key)) + if (static_key_false(&perf_sched_events.key)) __perf_event_task_sched_out(prev, next); } diff --git a/include/linux/static_key.h b/include/linux/static_key.h new file mode 100644 index 000000000000..27bd3f8a0857 --- /dev/null +++ b/include/linux/static_key.h @@ -0,0 +1 @@ +#include diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index fc36da97ff7e..bd96ecd0e05c 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include struct module; struct tracepoint; @@ -29,7 +29,7 @@ struct tracepoint_func { struct tracepoint { const char *name; /* Tracepoint name */ - struct jump_label_key key; + struct static_key key; void (*regfunc)(void); void (*unregfunc)(void); struct tracepoint_func __rcu *funcs; @@ -145,7 +145,7 @@ static inline void tracepoint_synchronize_unregister(void) extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ - if (static_branch(&__tracepoint_##name.key)) \ + if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ @@ -188,7 +188,7 @@ static inline void tracepoint_synchronize_unregister(void) __attribute__((section("__tracepoints_strings"))) = #name; \ struct tracepoint __tracepoint_##name \ __attribute__((section("__tracepoints"))) = \ - { __tpstrtab_##name, JUMP_LABEL_INIT, reg, unreg, NULL };\ + { __tpstrtab_##name, STATIC_KEY_INIT_FALSE, reg, unreg, NULL };\ static struct tracepoint * const __tracepoint_ptr_##name __used \ __attribute__((section("__tracepoints_ptrs"))) = \ &__tracepoint_##name; diff --git a/include/net/sock.h b/include/net/sock.h index 91c1c8baf020..dcde2d9268cd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -55,7 +55,7 @@ #include #include #include -#include +#include #include #include @@ -924,13 +924,13 @@ inline void sk_refcnt_debug_release(const struct sock *sk) #endif /* SOCK_REFCNT_DEBUG */ #if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) -extern struct jump_label_key memcg_socket_limit_enabled; +extern struct static_key memcg_socket_limit_enabled; static inline struct cg_proto *parent_cg_proto(struct proto *proto, struct cg_proto *cg_proto) { return proto->proto_cgroup(parent_mem_cgroup(cg_proto->memcg)); } -#define mem_cgroup_sockets_enabled static_branch(&memcg_socket_limit_enabled) +#define mem_cgroup_sockets_enabled static_key_false(&memcg_socket_limit_enabled) #else #define mem_cgroup_sockets_enabled 0 static inline struct cg_proto *parent_cg_proto(struct proto *proto, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7c3b9de55f6b..5e0f8bb89b2b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -128,7 +128,7 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct jump_label_key_deferred perf_sched_events __read_mostly; +struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static atomic_t nr_mmap_events __read_mostly; @@ -2769,7 +2769,7 @@ static void free_event(struct perf_event *event) if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -2780,7 +2780,7 @@ static void free_event(struct perf_event *event) put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_dec_deferred(&perf_sched_events); + static_key_slow_dec_deferred(&perf_sched_events); } } @@ -4982,7 +4982,7 @@ fail: return err; } -struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; +struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_event_destroy(struct perf_event *event) { @@ -4990,7 +4990,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); - jump_label_dec(&perf_swevent_enabled[event_id]); + static_key_slow_dec(&perf_swevent_enabled[event_id]); swevent_hlist_put(event); } @@ -5020,7 +5020,7 @@ static int perf_swevent_init(struct perf_event *event) if (err) return err; - jump_label_inc(&perf_swevent_enabled[event_id]); + static_key_slow_inc(&perf_swevent_enabled[event_id]); event->destroy = sw_perf_event_destroy; } @@ -5843,7 +5843,7 @@ done: if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -6081,7 +6081,7 @@ SYSCALL_DEFINE5(perf_event_open, * - that may need work on context switch */ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - jump_label_inc(&perf_sched_events.key); + static_key_slow_inc(&perf_sched_events.key); } /* diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 543782e7cdd2..bf9dcadbb53a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #ifdef HAVE_JUMP_LABEL @@ -29,10 +29,11 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool jump_label_enabled(struct jump_label_key *key) +bool static_key_enabled(struct static_key *key) { - return !!atomic_read(&key->enabled); + return (atomic_read(&key->enabled) > 0); } +EXPORT_SYMBOL_GPL(static_key_enabled); static int jump_label_cmp(const void *a, const void *b) { @@ -58,22 +59,26 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); } -static void jump_label_update(struct jump_label_key *key, int enable); +static void jump_label_update(struct static_key *key, int enable); -void jump_label_inc(struct jump_label_key *key) +void static_key_slow_inc(struct static_key *key) { if (atomic_inc_not_zero(&key->enabled)) return; jump_label_lock(); - if (atomic_read(&key->enabled) == 0) - jump_label_update(key, JUMP_LABEL_ENABLE); + if (atomic_read(&key->enabled) == 0) { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_ENABLE); + else + jump_label_update(key, JUMP_LABEL_DISABLE); + } atomic_inc(&key->enabled); jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_inc); +EXPORT_SYMBOL_GPL(static_key_slow_inc); -static void __jump_label_dec(struct jump_label_key *key, +static void __static_key_slow_dec(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) { @@ -85,32 +90,35 @@ static void __jump_label_dec(struct jump_label_key *key, if (rate_limit) { atomic_inc(&key->enabled); schedule_delayed_work(work, rate_limit); - } else - jump_label_update(key, JUMP_LABEL_DISABLE); - + } else { + if (!jump_label_get_branch_default(key)) + jump_label_update(key, JUMP_LABEL_DISABLE); + else + jump_label_update(key, JUMP_LABEL_ENABLE); + } jump_label_unlock(); } -EXPORT_SYMBOL_GPL(jump_label_dec); static void jump_label_update_timeout(struct work_struct *work) { - struct jump_label_key_deferred *key = - container_of(work, struct jump_label_key_deferred, work.work); - __jump_label_dec(&key->key, 0, NULL); + struct static_key_deferred *key = + container_of(work, struct static_key_deferred, work.work); + __static_key_slow_dec(&key->key, 0, NULL); } -void jump_label_dec(struct jump_label_key *key) +void static_key_slow_dec(struct static_key *key) { - __jump_label_dec(key, 0, NULL); + __static_key_slow_dec(key, 0, NULL); } +EXPORT_SYMBOL_GPL(static_key_slow_dec); -void jump_label_dec_deferred(struct jump_label_key_deferred *key) +void static_key_slow_dec_deferred(struct static_key_deferred *key) { - __jump_label_dec(&key->key, key->timeout, &key->work); + __static_key_slow_dec(&key->key, key->timeout, &key->work); } +EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred); - -void jump_label_rate_limit(struct jump_label_key_deferred *key, +void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { key->timeout = rl; @@ -153,7 +161,7 @@ void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry arch_jump_label_transform(entry, type); } -static void __jump_label_update(struct jump_label_key *key, +static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, int enable) { @@ -170,27 +178,40 @@ static void __jump_label_update(struct jump_label_key *key, } } +static enum jump_label_type jump_label_type(struct static_key *key) +{ + bool true_branch = jump_label_get_branch_default(key); + bool state = static_key_enabled(key); + + if ((!true_branch && state) || (true_branch && !state)) + return JUMP_LABEL_ENABLE; + + return JUMP_LABEL_DISABLE; +} + void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; - struct jump_label_key *key = NULL; + struct static_key *key = NULL; struct jump_entry *iter; jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; + struct static_key *iterk; - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + iterk = (struct static_key *)(unsigned long)iter->key; + arch_jump_label_transform_static(iter, jump_label_type(iterk)); if (iterk == key) continue; key = iterk; - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; #ifdef CONFIG_MODULES key->next = NULL; #endif @@ -200,8 +221,8 @@ void __init jump_label_init(void) #ifdef CONFIG_MODULES -struct jump_label_mod { - struct jump_label_mod *next; +struct static_key_mod { + struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; @@ -221,9 +242,9 @@ static int __jump_label_mod_text_reserved(void *start, void *end) start, end); } -static void __jump_label_mod_update(struct jump_label_key *key, int enable) +static void __jump_label_mod_update(struct static_key *key, int enable) { - struct jump_label_mod *mod = key->next; + struct static_key_mod *mod = key->next; while (mod) { struct module *m = mod->mod; @@ -254,11 +275,7 @@ void jump_label_apply_nops(struct module *mod) return; for (iter = iter_start; iter < iter_stop; iter++) { - struct jump_label_key *iterk; - - iterk = (struct jump_label_key *)(unsigned long)iter->key; - arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ? - JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE); + arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); } } @@ -267,8 +284,8 @@ static int jump_label_add_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm; + struct static_key *key = NULL; + struct static_key_mod *jlm; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) @@ -277,28 +294,30 @@ static int jump_label_add_module(struct module *mod) jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { - if (iter->key == (jump_label_t)(unsigned long)key) - continue; + struct static_key *iterk; - key = (struct jump_label_key *)(unsigned long)iter->key; + iterk = (struct static_key *)(unsigned long)iter->key; + if (iterk == key) + continue; + key = iterk; if (__module_address(iter->key) == mod) { - atomic_set(&key->enabled, 0); - key->entries = iter; + /* + * Set key->entries to iter, but preserve JUMP_LABEL_TRUE_BRANCH. + */ + *((unsigned long *)&key->entries) += (unsigned long)iter; key->next = NULL; continue; } - - jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; - jlm->mod = mod; jlm->entries = iter; jlm->next = key->next; key->next = jlm; - if (jump_label_enabled(key)) + if (jump_label_type(key) == JUMP_LABEL_ENABLE) __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); } @@ -310,14 +329,14 @@ static void jump_label_del_module(struct module *mod) struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; - struct jump_label_key *key = NULL; - struct jump_label_mod *jlm, **prev; + struct static_key *key = NULL; + struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (iter->key == (jump_label_t)(unsigned long)key) continue; - key = (struct jump_label_key *)(unsigned long)iter->key; + key = (struct static_key *)(unsigned long)iter->key; if (__module_address(iter->key) == mod) continue; @@ -419,9 +438,10 @@ int jump_label_text_reserved(void *start, void *end) return ret; } -static void jump_label_update(struct jump_label_key *key, int enable) +static void jump_label_update(struct static_key *key, int enable) { - struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + struct jump_entry *stop = __stop___jump_table; + struct jump_entry *entry = jump_label_get_entries(key); #ifdef CONFIG_MODULES struct module *mod = __module_address((unsigned long)key); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5255c9d2e053..112c6824476b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -162,13 +162,13 @@ static int sched_feat_show(struct seq_file *m, void *v) #ifdef HAVE_JUMP_LABEL -#define jump_label_key__true jump_label_key_enabled -#define jump_label_key__false jump_label_key_disabled +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE #define SCHED_FEAT(name, enabled) \ jump_label_key__##enabled , -struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { #include "features.h" }; @@ -176,14 +176,14 @@ struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = { static void sched_feat_disable(int i) { - if (jump_label_enabled(&sched_feat_keys[i])) - jump_label_dec(&sched_feat_keys[i]); + if (static_key_enabled(&sched_feat_keys[i])) + static_key_slow_dec(&sched_feat_keys[i]); } static void sched_feat_enable(int i) { - if (!jump_label_enabled(&sched_feat_keys[i])) - jump_label_inc(&sched_feat_keys[i]); + if (!static_key_enabled(&sched_feat_keys[i])) + static_key_slow_inc(&sched_feat_keys[i]); } #else static void sched_feat_disable(int i) { }; @@ -894,7 +894,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= irq_delta; #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - if (static_branch((¶virt_steal_rq_enabled))) { + if (static_key_false((¶virt_steal_rq_enabled))) { u64 st; steal = paravirt_steal_clock(cpu_of(rq)); @@ -2756,7 +2756,7 @@ void account_idle_time(cputime_t cputime) static __always_inline bool steal_account_process_tick(void) { #ifdef CONFIG_PARAVIRT - if (static_branch(¶virt_steal_enabled)) { + if (static_key_false(¶virt_steal_enabled)) { u64 steal, st = 0; steal = paravirt_steal_clock(smp_processor_id()); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7c6414fc669d..423547ada38a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1399,20 +1399,20 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_CFS_BANDWIDTH #ifdef HAVE_JUMP_LABEL -static struct jump_label_key __cfs_bandwidth_used; +static struct static_key __cfs_bandwidth_used; static inline bool cfs_bandwidth_used(void) { - return static_branch(&__cfs_bandwidth_used); + return static_key_false(&__cfs_bandwidth_used); } void account_cfs_bandwidth_used(int enabled, int was_enabled) { /* only need to count groups transitioning between enabled/!enabled */ if (enabled && !was_enabled) - jump_label_inc(&__cfs_bandwidth_used); + static_key_slow_inc(&__cfs_bandwidth_used); else if (!enabled && was_enabled) - jump_label_dec(&__cfs_bandwidth_used); + static_key_slow_dec(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..b4cd6d8ea150 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -611,7 +611,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ #ifdef CONFIG_SCHED_DEBUG -# include +# include # define const_debug __read_mostly #else # define const_debug const @@ -630,18 +630,18 @@ enum { #undef SCHED_FEAT #if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) -static __always_inline bool static_branch__true(struct jump_label_key *key) +static __always_inline bool static_branch__true(struct static_key *key) { - return likely(static_branch(key)); /* Not out of line branch. */ + return static_key_true(key); /* Not out of line branch. */ } -static __always_inline bool static_branch__false(struct jump_label_key *key) +static __always_inline bool static_branch__false(struct static_key *key) { - return unlikely(static_branch(key)); /* Out of line branch. */ + return static_key_false(key); /* Out of line branch. */ } #define SCHED_FEAT(name, enabled) \ -static __always_inline bool static_branch_##name(struct jump_label_key *key) \ +static __always_inline bool static_branch_##name(struct static_key *key) \ { \ return static_branch__##enabled(key); \ } @@ -650,7 +650,7 @@ static __always_inline bool static_branch_##name(struct jump_label_key *key) \ #undef SCHED_FEAT -extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR]; +extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x])) #else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */ #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index f1539decd99d..d96ba22dabfa 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include extern struct tracepoint * const __start___tracepoints_ptrs[]; extern struct tracepoint * const __stop___tracepoints_ptrs[]; @@ -256,9 +256,9 @@ static void set_tracepoint(struct tracepoint_entry **entry, { WARN_ON(strcmp((*entry)->name, elem->name) != 0); - if (elem->regfunc && !jump_label_enabled(&elem->key) && active) + if (elem->regfunc && !static_key_enabled(&elem->key) && active) elem->regfunc(); - else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active) + else if (elem->unregfunc && static_key_enabled(&elem->key) && !active) elem->unregfunc(); /* @@ -269,10 +269,10 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - if (active && !jump_label_enabled(&elem->key)) - jump_label_inc(&elem->key); - else if (!active && jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (active && !static_key_enabled(&elem->key)) + static_key_slow_inc(&elem->key); + else if (!active && static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); } /* @@ -283,11 +283,11 @@ static void set_tracepoint(struct tracepoint_entry **entry, */ static void disable_tracepoint(struct tracepoint *elem) { - if (elem->unregfunc && jump_label_enabled(&elem->key)) + if (elem->unregfunc && static_key_enabled(&elem->key)) elem->unregfunc(); - if (jump_label_enabled(&elem->key)) - jump_label_dec(&elem->key); + if (static_key_enabled(&elem->key)) + static_key_slow_dec(&elem->key); rcu_assign_pointer(elem->funcs, NULL); } diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1d985d..da7ce7f0e566 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -134,7 +134,7 @@ #include #include #include -#include +#include #include #include "net-sysfs.h" @@ -1441,11 +1441,11 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev) } EXPORT_SYMBOL(call_netdevice_notifiers); -static struct jump_label_key netstamp_needed __read_mostly; +static struct static_key netstamp_needed __read_mostly; #ifdef HAVE_JUMP_LABEL -/* We are not allowed to call jump_label_dec() from irq context +/* We are not allowed to call static_key_slow_dec() from irq context * If net_disable_timestamp() is called from irq context, defer the - * jump_label_dec() calls. + * static_key_slow_dec() calls. */ static atomic_t netstamp_needed_deferred; #endif @@ -1457,12 +1457,12 @@ void net_enable_timestamp(void) if (deferred) { while (--deferred) - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); return; } #endif WARN_ON(in_interrupt()); - jump_label_inc(&netstamp_needed); + static_key_slow_inc(&netstamp_needed); } EXPORT_SYMBOL(net_enable_timestamp); @@ -1474,19 +1474,19 @@ void net_disable_timestamp(void) return; } #endif - jump_label_dec(&netstamp_needed); + static_key_slow_dec(&netstamp_needed); } EXPORT_SYMBOL(net_disable_timestamp); static inline void net_timestamp_set(struct sk_buff *skb) { skb->tstamp.tv64 = 0; - if (static_branch(&netstamp_needed)) + if (static_key_false(&netstamp_needed)) __net_timestamp(skb); } #define net_timestamp_check(COND, SKB) \ - if (static_branch(&netstamp_needed)) { \ + if (static_key_false(&netstamp_needed)) { \ if ((COND) && !(SKB)->tstamp.tv64) \ __net_timestamp(SKB); \ } \ @@ -2660,7 +2660,7 @@ EXPORT_SYMBOL(__skb_get_rxhash); struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly; EXPORT_SYMBOL(rps_sock_flow_table); -struct jump_label_key rps_needed __read_mostly; +struct static_key rps_needed __read_mostly; static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, @@ -2945,7 +2945,7 @@ int netif_rx(struct sk_buff *skb) trace_netif_rx(skb); #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu; @@ -3309,7 +3309,7 @@ int netif_receive_skb(struct sk_buff *skb) return NET_RX_SUCCESS; #ifdef CONFIG_RPS - if (static_branch(&rps_needed)) { + if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; int cpu, ret; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a1727cda03d7..495586232aa1 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -608,10 +608,10 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, spin_unlock(&rps_map_lock); if (map) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (old_map) { kfree_rcu(old_map, rcu); - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); } free_cpumask_var(mask); return len; diff --git a/net/core/sock.c b/net/core/sock.c index 3e81fd2e3c75..3a4e5817a2a7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -111,7 +111,7 @@ #include #include #include -#include +#include #include #include @@ -184,7 +184,7 @@ void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss) static struct lock_class_key af_family_keys[AF_MAX]; static struct lock_class_key af_family_slock_keys[AF_MAX]; -struct jump_label_key memcg_socket_limit_enabled; +struct static_key memcg_socket_limit_enabled; EXPORT_SYMBOL(memcg_socket_limit_enabled); /* diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d05559d4d9cd..0c2850874254 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -69,9 +69,9 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write, if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) - jump_label_inc(&rps_needed); + static_key_slow_inc(&rps_needed); if (orig_sock_table) { - jump_label_dec(&rps_needed); + static_key_slow_dec(&rps_needed); synchronize_rcu(); vfree(orig_sock_table); } diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 49978788a9dc..602fb305365f 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -111,7 +111,7 @@ void tcp_destroy_cgroup(struct cgroup *cgrp, struct cgroup_subsys *ss) val = res_counter_read_u64(&tcp->tcp_memory_allocated, RES_LIMIT); if (val != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); } EXPORT_SYMBOL(tcp_destroy_cgroup); @@ -143,9 +143,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) net->ipv4.sysctl_tcp_mem[i]); if (val == RESOURCE_MAX && old_lim != RESOURCE_MAX) - jump_label_dec(&memcg_socket_limit_enabled); + static_key_slow_dec(&memcg_socket_limit_enabled); else if (old_lim == RESOURCE_MAX && val != RESOURCE_MAX) - jump_label_inc(&memcg_socket_limit_enabled); + static_key_slow_inc(&memcg_socket_limit_enabled); return 0; } diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b4e8ff05b301..e1b7e051332e 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -56,7 +56,7 @@ struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); #if defined(CONFIG_JUMP_LABEL) -struct jump_label_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; +struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif @@ -77,7 +77,7 @@ int nf_register_hook(struct nf_hook_ops *reg) list_add_rcu(®->list, elem->list.prev); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_inc(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif return 0; } @@ -89,7 +89,7 @@ void nf_unregister_hook(struct nf_hook_ops *reg) list_del_rcu(®->list); mutex_unlock(&nf_hook_mutex); #if defined(CONFIG_JUMP_LABEL) - jump_label_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); + static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif synchronize_net(); } -- cgit v1.2.3 From abd2363f6a5f1030b935e0bdc15cf917313b3b10 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Fri, 24 Feb 2012 08:07:06 -0700 Subject: irq_domain/mips: Allow irq_domain on MIPS This patch makes IRQ_DOMAIN usable on MIPS. It uses an ugly workaround to preserve current behaviour so that MIPS has time to add irq_domain registration to the irq controller drivers. The workaround will be removed in Linux v3.6 Signed-off-by: Grant Likely Cc: Ralf Baechle Cc: Rob Herring Cc: Thomas Gleixner Cc: linux-mips@linux-mips.org --- arch/mips/Kconfig | 1 + arch/mips/include/asm/irq.h | 5 +---- arch/mips/kernel/prom.c | 14 -------------- kernel/irq/irqdomain.c | 12 ++++++++++++ 4 files changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 5ab6e89603c5..edbbae17e820 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2327,6 +2327,7 @@ config USE_OF bool "Flattened Device Tree support" select OF select OF_EARLY_FLATTREE + select IRQ_DOMAIN help Include support for flattened device tree machine descriptions. diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index 2354c870a63a..fb698dc09bc9 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -11,15 +11,12 @@ #include #include +#include #include #include -static inline void irq_dispose_mapping(unsigned int virq) -{ -} - #ifdef CONFIG_I8259 static inline int irq_canonicalize(int irq) { diff --git a/arch/mips/kernel/prom.c b/arch/mips/kernel/prom.c index 6b8b4208481e..558b5395795d 100644 --- a/arch/mips/kernel/prom.c +++ b/arch/mips/kernel/prom.c @@ -60,20 +60,6 @@ void __init early_init_dt_setup_initrd_arch(unsigned long start, } #endif -/* - * irq_create_of_mapping - Hook to resolve OF irq specifier into a Linux irq# - * - * Currently the mapping mechanism is trivial; simple flat hwirq numbers are - * mapped 1:1 onto Linux irq numbers. Cascaded irq controllers are not - * supported. - */ -unsigned int irq_create_of_mapping(struct device_node *controller, - const u32 *intspec, unsigned int intsize) -{ - return intspec[0]; -} -EXPORT_SYMBOL_GPL(irq_create_of_mapping); - void __init early_init_devtree(void *params) { /* Setup flat device-tree pointer */ diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 25a498eb98a3..af48e59bc2ff 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -411,6 +411,18 @@ unsigned int irq_create_of_mapping(struct device_node *controller, domain = controller ? irq_find_host(controller) : irq_default_domain; if (!domain) { +#ifdef CONFIG_MIPS + /* + * Workaround to avoid breaking interrupt controller drivers + * that don't yet register an irq_domain. This is temporary + * code. ~~~gcl, Feb 24, 2012 + * + * Scheduled for removal in Linux v3.6. That should be enough + * time. + */ + if (intsize > 0) + return intspec[0]; +#endif printk(KERN_WARNING "irq: no irq domain found for %s !\n", controller->full_name); return 0; -- cgit v1.2.3 From d80e731ecab420ddcb79ee9d0ac427acbc187b4b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 24 Feb 2012 20:07:11 +0100 Subject: epoll: introduce POLLFREE to flush ->signalfd_wqh before kfree() This patch is intentionally incomplete to simplify the review. It ignores ep_unregister_pollwait() which plays with the same wqh. See the next change. epoll assumes that the EPOLL_CTL_ADD'ed file controls everything f_op->poll() needs. In particular it assumes that the wait queue can't go away until eventpoll_release(). This is not true in case of signalfd, the task which does EPOLL_CTL_ADD uses its ->sighand which is not connected to the file. This patch adds the special event, POLLFREE, currently only for epoll. It expects that init_poll_funcptr()'ed hook should do the necessary cleanup. Perhaps it should be defined as EPOLLFREE in eventpoll. __cleanup_sighand() is changed to do wake_up_poll(POLLFREE) if ->signalfd_wqh is not empty, we add the new signalfd_cleanup() helper. ep_poll_callback(POLLFREE) simply does list_del_init(task_list). This make this poll entry inconsistent, but we don't care. If you share epoll fd which contains our sigfd with another process you should blame yourself. signalfd is "really special". I simply do not know how we can define the "right" semantics if it used with epoll. The main problem is, epoll calls signalfd_poll() once to establish the connection with the wait queue, after that signalfd_poll(NULL) returns the different/inconsistent results depending on who does EPOLL_CTL_MOD/signalfd_read/etc. IOW: apart from sigmask, signalfd has nothing to do with the file, it works with the current thread. In short: this patch is the hack which tries to fix the symptoms. It also assumes that nobody can take tasklist_lock under epoll locks, this seems to be true. Note: - we do not have wake_up_all_poll() but wake_up_poll() is fine, poll/epoll doesn't use WQ_FLAG_EXCLUSIVE. - signalfd_cleanup() uses POLLHUP along with POLLFREE, we need a couple of simple changes in eventpoll.c to make sure it can't be "lost". Reported-by: Maxime Bizon Cc: Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++++ fs/signalfd.c | 11 +++++++++++ include/asm-generic/poll.h | 2 ++ include/linux/signalfd.h | 5 ++++- kernel/fork.c | 5 ++++- 5 files changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index aabdfc38cf24..34bbfc6dd8dc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -842,6 +842,10 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + /* the caller holds eppoll_entry->whead->lock */ + if ((unsigned long)key & POLLFREE) + list_del_init(&wait->task_list); + spin_lock_irqsave(&ep->lock, flags); /* diff --git a/fs/signalfd.c b/fs/signalfd.c index 492465b451dd..79c1eea98a3a 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -30,6 +30,17 @@ #include #include +void signalfd_cleanup(struct sighand_struct *sighand) +{ + wait_queue_head_t *wqh = &sighand->signalfd_wqh; + + if (likely(!waitqueue_active(wqh))) + return; + + /* wait_queue_t->func(POLLFREE) should do remove_wait_queue() */ + wake_up_poll(wqh, POLLHUP | POLLFREE); +} + struct signalfd_ctx { sigset_t sigmask; }; diff --git a/include/asm-generic/poll.h b/include/asm-generic/poll.h index 44bce836d350..9ce7f44aebd2 100644 --- a/include/asm-generic/poll.h +++ b/include/asm-generic/poll.h @@ -28,6 +28,8 @@ #define POLLRDHUP 0x2000 #endif +#define POLLFREE 0x4000 /* currently only for epoll */ + struct pollfd { int fd; short events; diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index 3ff4961da9b5..247399b2979a 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -61,13 +61,16 @@ static inline void signalfd_notify(struct task_struct *tsk, int sig) wake_up(&tsk->sighand->signalfd_wqh); } +extern void signalfd_cleanup(struct sighand_struct *sighand); + #else /* CONFIG_SIGNALFD */ static inline void signalfd_notify(struct task_struct *tsk, int sig) { } +static inline void signalfd_cleanup(struct sighand_struct *sighand) { } + #endif /* CONFIG_SIGNALFD */ #endif /* __KERNEL__ */ #endif /* _LINUX_SIGNALFD_H */ - diff --git a/kernel/fork.c b/kernel/fork.c index b77fd559c78e..e2cd3e2a5ae8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include @@ -935,8 +936,10 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) void __cleanup_sighand(struct sighand_struct *sighand) { - if (atomic_dec_and_test(&sighand->count)) + if (atomic_dec_and_test(&sighand->count)) { + signalfd_cleanup(sighand); kmem_cache_free(sighand_cachep, sighand); + } } -- cgit v1.2.3 From 8f2f748b0656257153bcf0941df8d6060acc5ca6 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 23 Feb 2012 15:27:15 +0530 Subject: CPU hotplug, cpusets, suspend: Don't touch cpusets during suspend/resume Currently, during CPU hotplug, the cpuset callbacks modify the cpusets to reflect the state of the system, and this handling is asymmetric. That is, upon CPU offline, that CPU is removed from all cpusets. However when it comes back online, it is put back only to the root cpuset. This gives rise to a significant problem during suspend/resume. During suspend, we offline all non-boot cpus and during resume we online them back. Which means, after a resume, all cpusets (except the root cpuset) will be restricted to just one single CPU (the boot cpu). But the whole point of suspend/resume is to restore the system to a state which is as close as possible to how it was before suspend. So to fix this, don't touch cpusets during suspend/resume. That is, modify the cpuset-related CPU hotplug callback to just ignore CPU hotplug when it is initiated as part of the suspend/resume sequence. Reported-by: Prashanth Nageshappa Signed-off-by: Srivatsa S. Bhat Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/4F460D7B.1020703@linux.vnet.ibm.com Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b342f57879e6..33a0676ea744 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6728,7 +6728,7 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev) static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: cpuset_update_active_cpus(); @@ -6741,7 +6741,7 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action & ~CPU_TASKS_FROZEN) { + switch (action) { case CPU_DOWN_PREPARE: cpuset_update_active_cpus(); return NOTIFY_OK; -- cgit v1.2.3 From 8c9cf542b8a66c231747a550573d910daf17f0e9 Mon Sep 17 00:00:00 2001 From: Gerlando Falauto Date: Mon, 27 Feb 2012 09:08:21 +0100 Subject: tracing: Do not select FRAME_POINTER on PPC On PowerPC, FUNCTION_TRACER selects FRAME_POINTER, even though the architecture does not support it. This causes the following warning: warning: (LOCKDEP && FAULT_INJECTION_STACKTRACE_FILTER && LATENCYTOP && FUNCTION_TRACER && KMEMCHECK) selects FRAME_POINTER which has unmet direct dependencies (DEBUG_KERNEL && (CRIS || M68K || FRV || UML || AVR32 || SUPERH || BLACKFIN || MN10300) || ARCH_WANT_FRAME_POINTERS) So remove the warning by adding the extra condition "if !PPC" to FUNCTION_TRACER for FRAME_POINTER selection Link: http://lkml.kernel.org/r/1330330101-8618-1-git-send-email-gerlando.falauto@keymile.com Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Gerlando Falauto Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index cd3134510f3d..a1d2849f2473 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -141,7 +141,7 @@ if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" depends on HAVE_FUNCTION_TRACER - select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE + select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER -- cgit v1.2.3 From 30ce2f7eef095d1b8d070740f1948629814fe3c7 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 28 Feb 2012 10:19:38 +0900 Subject: perf/hwbp: Fix a possible memory leak If kzalloc() for TYPE_DATA failed on a given cpu, previous chunk of TYPE_INST will be leaked. Fix it. Thanks to Peter Zijlstra for suggesting this better solution. It should work as long as the initial value of the region is all 0's and that's the case of static (per-cpu) memory allocation. Signed-off-by: Namhyung Kim Acked-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/1330391978-28070-1-git-send-email-namhyung.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index b7971d6f38bf..ee706ce44aa0 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -651,10 +651,10 @@ int __init init_hw_breakpoint(void) err_alloc: for_each_possible_cpu(err_cpu) { - if (err_cpu == cpu) - break; for (i = 0; i < TYPE_MAX; i++) kfree(per_cpu(nr_task_bp_pinned[i], cpu)); + if (err_cpu == cpu) + break; } return -ENOMEM; -- cgit v1.2.3 From 8eedce996556d7d06522cd3a0e6069141c8dffe0 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Tue, 28 Feb 2012 13:49:01 -0500 Subject: static keys: Inline the static_key_enabled() function In the jump label enabled case, calling static_key_enabled() results in a function call. The function returns the results of a compare, so it really doesn't need the overhead of a full function call. Let's make it 'static inline' for both the jump label enabled and disabled cases. Signed-off-by: Jason Baron Cc: a.p.zijlstra@chello.nl Cc: rostedt@goodmis.org Cc: mathieu.desnoyers@polymtl.ca Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/201202281849.q1SIn1p2023270@int-mx10.intmail.prod.int.phx2.redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 11 +++++------ kernel/jump_label.c | 6 ------ 2 files changed, 5 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 2172da2d9bb4..c513a40510f5 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -127,7 +127,6 @@ extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); extern void static_key_slow_dec_deferred(struct static_key_deferred *key); -extern bool static_key_enabled(struct static_key *key); extern void jump_label_apply_nops(struct module *mod); extern void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); @@ -198,11 +197,6 @@ static inline int jump_label_text_reserved(void *start, void *end) static inline void jump_label_lock(void) {} static inline void jump_label_unlock(void) {} -static inline bool static_key_enabled(struct static_key *key) -{ - return (atomic_read(&key->enabled) > 0); -} - static inline int jump_label_apply_nops(struct module *mod) { return 0; @@ -224,4 +218,9 @@ jump_label_rate_limit(struct static_key_deferred *key, #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE #define jump_label_enabled static_key_enabled +static inline bool static_key_enabled(struct static_key *key) +{ + return (atomic_read(&key->enabled) > 0); +} + #endif /* _LINUX_JUMP_LABEL_H */ diff --git a/kernel/jump_label.c b/kernel/jump_label.c index bf9dcadbb53a..43049192b5ec 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -29,12 +29,6 @@ void jump_label_unlock(void) mutex_unlock(&jump_label_mutex); } -bool static_key_enabled(struct static_key *key) -{ - return (atomic_read(&key->enabled) > 0); -} -EXPORT_SYMBOL_GPL(static_key_enabled); - static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; -- cgit v1.2.3 From 13ae246db4a02971ef4f557af1f6d3e21d64b710 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Sun, 29 Jan 2012 15:44:45 -0500 Subject: includecheck: delete any duplicate instances of module.h Different tree maintainers picked up independently generated trivial compile fixes based on linux-next testing, resulting in some cases where a file would have got more than one addition of module.h once everything was all merged together. Delete any duplicates so includecheck isn't complaining about anything related to module.h/export.h changes. Signed-off-by: Paul Gortmaker --- arch/blackfin/mach-bf537/boards/pnav10.c | 1 - drivers/dma/imx-dma.c | 1 - drivers/dma/imx-sdma.c | 1 - drivers/media/video/adp1653.c | 1 - drivers/mmc/host/sdhci-tegra.c | 1 - drivers/power/max8998_charger.c | 1 - drivers/staging/iio/dac/ad5686.c | 1 - drivers/staging/iio/gyro/adis16060_core.c | 1 - drivers/staging/sm7xx/smtcfb.c | 1 - drivers/usb/dwc3/core.c | 1 - drivers/usb/dwc3/dwc3-omap.c | 1 - kernel/params.c | 1 - 12 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/mach-bf537/boards/pnav10.c b/arch/blackfin/mach-bf537/boards/pnav10.c index 6fd84709fc68..7d15a3024e48 100644 --- a/arch/blackfin/mach-bf537/boards/pnav10.c +++ b/arch/blackfin/mach-bf537/boards/pnav10.c @@ -101,7 +101,6 @@ static struct platform_device smc91x_device = { #if defined(CONFIG_BFIN_MAC) || defined(CONFIG_BFIN_MAC_MODULE) #include -#include static const unsigned short bfin_mac_peripherals[] = P_RMII0; static struct bfin_phydev_platform_data bfin_phydev_data[] = { diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c index e4383ee2c9ac..38586ba8da91 100644 --- a/drivers/dma/imx-dma.c +++ b/drivers/dma/imx-dma.c @@ -14,7 +14,6 @@ * http://www.gnu.org/copyleft/gpl.html */ #include -#include #include #include #include diff --git a/drivers/dma/imx-sdma.c b/drivers/dma/imx-sdma.c index 8bc5acf36ee5..63540d3e2153 100644 --- a/drivers/dma/imx-sdma.c +++ b/drivers/dma/imx-sdma.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/video/adp1653.c b/drivers/media/video/adp1653.c index 12eedf4d515a..6e7d094fa2b6 100644 --- a/drivers/media/video/adp1653.c +++ b/drivers/media/video/adp1653.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 78a36eba4df0..cb348569454b 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -23,7 +23,6 @@ #include #include #include -#include #include diff --git a/drivers/power/max8998_charger.c b/drivers/power/max8998_charger.c index 9b3f2bf56e70..6dc01c255592 100644 --- a/drivers/power/max8998_charger.c +++ b/drivers/power/max8998_charger.c @@ -19,7 +19,6 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include diff --git a/drivers/staging/iio/dac/ad5686.c b/drivers/staging/iio/dac/ad5686.c index ce2d6193dd89..2415a6e60c77 100644 --- a/drivers/staging/iio/dac/ad5686.c +++ b/drivers/staging/iio/dac/ad5686.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "../iio.h" #include "../sysfs.h" diff --git a/drivers/staging/iio/gyro/adis16060_core.c b/drivers/staging/iio/gyro/adis16060_core.c index c0ca7093e0ed..02cc23420b90 100644 --- a/drivers/staging/iio/gyro/adis16060_core.c +++ b/drivers/staging/iio/gyro/adis16060_core.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "../iio.h" #include "../sysfs.h" diff --git a/drivers/staging/sm7xx/smtcfb.c b/drivers/staging/sm7xx/smtcfb.c index ae0035f327e7..1b3e2d0c6993 100644 --- a/drivers/staging/sm7xx/smtcfb.c +++ b/drivers/staging/sm7xx/smtcfb.c @@ -41,7 +41,6 @@ #ifdef CONFIG_PM #include -#include #endif #include "smtcfb.h" diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 7c9df630dbe4..eed4a5b6d394 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -51,7 +51,6 @@ #include #include -#include #include "core.h" #include "gadget.h" diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index 3274ac8f1200..92cc7b8bc091 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -46,7 +46,6 @@ #include #include #include -#include #include "core.h" #include "io.h" diff --git a/kernel/params.c b/kernel/params.c index 4bc965d8a1fe..47f5bf12434a 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -15,7 +15,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include #include #include #include -- cgit v1.2.3 From 42c62a589f1ccbf38a02cb732231f9c2fccc5ab0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Oct 2011 22:03:48 +0200 Subject: sched/rt: Keep period timer ticking when rt throttling is active When a runqueue is throttled we cannot disable the period timer because that timer is the only way to undo the throttling. We got stale throttling entries when a rq was throttled and then the global sysctl was disabled, which stopped the timer. Signed-off-by: Peter Zijlstra [ Added changelog ] Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-nuj34q52p6ro7szapuz84i0v@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f70206c2c802..6d1eb0be1870 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -778,12 +778,9 @@ static inline int balance_runtime(struct rt_rq *rt_rq) static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) { - int i, idle = 1; + int i, idle = 1, throttled = 0; const struct cpumask *span; - if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) - return 1; - span = sched_rt_period_mask(); for_each_cpu(i, span) { int enqueue = 0; @@ -818,12 +815,17 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) if (!rt_rq_throttled(rt_rq)) enqueue = 1; } + if (rt_rq->rt_throttled) + throttled = 1; if (enqueue) sched_rt_rq_enqueue(rt_rq); raw_spin_unlock(&rq->lock); } + if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)) + return 1; + return idle; } @@ -884,7 +886,8 @@ static void update_curr_rt(struct rq *rq) if (unlikely((s64)delta_exec < 0)) delta_exec = 0; - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + schedstat_set(curr->se.statistics.exec_max, + max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); -- cgit v1.2.3 From 7abc63b1bd412f7655b62ef3e35c3c11c5134636 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Oct 2011 22:03:48 +0200 Subject: sched/rt: Do not throttle when PI boosting When a runqueue has rt_runtime_us = 0 then the only way it can accumulate rt_time is via PI boosting. That causes the runqueue to be throttled and replenishing does not change anything due to rt_runtime_us = 0. So avoid that situation by clearing rt_time and skip the throttling alltogether. Signed-off-by: Peter Zijlstra [ Changelog ] Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-7x70cypsotjb4jvcor3edctk@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 6d1eb0be1870..7f7e7cdcb472 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -857,8 +857,24 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; if (rt_rq->rt_time > runtime) { - rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + /* + * Don't actually throttle groups that have no runtime assigned + * but accrue some time due to boosting. + */ + if (likely(rt_b->rt_runtime)) { + rt_rq->rt_throttled = 1; + printk_once(KERN_WARNING "sched: RT throttling activated\n"); + } else { + /* + * In case we did anyway, make it go away, + * replenishment is a joke, since it will replenish us + * with exactly 0 ns. + */ + rt_rq->rt_time = 0; + } + if (rt_rq_throttled(rt_rq)) { sched_rt_rq_dequeue(rt_rq); return 1; -- cgit v1.2.3 From c5491ea779793f977d282754db478157cc409d82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 12:09:35 +0100 Subject: sched/rt: Add schedule_preempt_disabled() Add helper to get rid of the ever repeating: preempt_enable_no_resched(); schedule(); preempt_disable(); patterns. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-wxx7btox7coby6ifv5vzhzgp@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1a6398424fab..03dd224d0667 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -361,6 +361,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); +extern void schedule_preempt_disabled(void); extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 91fe6a0e9098..73022395c00e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3246,6 +3246,18 @@ asmlinkage void __sched schedule(void) } EXPORT_SYMBOL(schedule); +/** + * schedule_preempt_disabled - called with preemption disabled + * + * Returns with preemption disabled. Note: preempt_count must be 1 + */ +void __sched schedule_preempt_disabled(void) +{ + preempt_enable_no_resched(); + schedule(); + preempt_disable(); +} + #ifdef CONFIG_MUTEX_SPIN_ON_OWNER static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -- cgit v1.2.3 From bd2f55361f18347e890d52ff9cfd8895455ec11b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 12:33:18 +0100 Subject: sched/rt: Use schedule_preempt_disabled() Coccinelle based conversion. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-24swm5zut3h9c4a6s46x8rws@git.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/kernel/process.c | 4 +--- arch/avr32/kernel/process.c | 4 +--- arch/blackfin/kernel/process.c | 4 +--- arch/cris/kernel/process.c | 4 +--- arch/frv/kernel/process.c | 4 +--- arch/h8300/kernel/process.c | 4 +--- arch/ia64/kernel/process.c | 4 +--- arch/m32r/kernel/process.c | 4 +--- arch/m68k/kernel/process_mm.c | 4 +--- arch/m68k/kernel/process_no.c | 4 +--- arch/microblaze/kernel/process.c | 4 +--- arch/mips/kernel/process.c | 4 +--- arch/mn10300/kernel/process.c | 4 +--- arch/parisc/kernel/process.c | 4 +--- arch/powerpc/kernel/idle.c | 8 ++++---- arch/powerpc/platforms/iseries/setup.c | 8 ++------ arch/s390/kernel/process.c | 4 +--- arch/score/kernel/process.c | 4 +--- arch/sh/kernel/idle.c | 4 +--- arch/sparc/kernel/process_32.c | 8 ++------ arch/sparc/kernel/process_64.c | 10 ++++------ arch/tile/kernel/process.c | 4 +--- arch/x86/kernel/process_32.c | 4 +--- arch/x86/kernel/process_64.c | 4 +--- arch/xtensa/kernel/process.c | 4 +--- init/main.c | 5 +---- kernel/mutex.c | 4 +--- kernel/softirq.c | 4 +--- 28 files changed, 36 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 971d65c253a9..c2ae3cd331fe 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -239,9 +239,7 @@ void cpu_idle(void) leds_event(led_idle_end); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index ea3395750324..92c5af98a6f7 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -40,9 +40,7 @@ void cpu_idle(void) cpu_idle_sleep(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 8dd0416673cb..a80a643f3691 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -94,9 +94,7 @@ void cpu_idle(void) idle(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index aa585e4e979e..d8f50ff6fadd 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -115,9 +115,7 @@ void cpu_idle (void) idle = default_idle; idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 3901df1213c0..29cc49783787 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -92,9 +92,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 933bd388efb2..1a173b35f475 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -81,9 +81,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 6d33c5cc94f0..9dc52b63fc87 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -330,9 +330,7 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); if (cpu_is_offline(cpu)) play_dead(); diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 422bea9f1dbc..3a4a32b27208 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -90,9 +90,7 @@ void cpu_idle (void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_mm.c b/arch/m68k/kernel/process_mm.c index 099283ee1a8f..fe4186b5fc32 100644 --- a/arch/m68k/kernel/process_mm.c +++ b/arch/m68k/kernel/process_mm.c @@ -78,9 +78,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/m68k/kernel/process_no.c b/arch/m68k/kernel/process_no.c index 5e1078cabe0e..f7fe6c348595 100644 --- a/arch/m68k/kernel/process_no.c +++ b/arch/m68k/kernel/process_no.c @@ -73,9 +73,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 7dcb5bfffb75..9155f7d92669 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -110,9 +110,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 7955409051c4..61f1cb45a1d5 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -80,9 +80,7 @@ void __noreturn cpu_idle(void) #endif rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 28eec3102535..cac401d37f75 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -123,9 +123,7 @@ void cpu_idle(void) idle(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 62c60b87d039..d4b94b395c16 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -71,9 +71,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 0a48bf5db6c8..65035141552b 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -101,11 +101,11 @@ void cpu_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - if (cpu_should_die()) + if (cpu_should_die()) { + preempt_enable_no_resched(); cpu_die(); - schedule(); - preempt_disable(); + } + schedule_preempt_disabled(); } } diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c index 8fc62586a973..a5fbf4cb6329 100644 --- a/arch/powerpc/platforms/iseries/setup.c +++ b/arch/powerpc/platforms/iseries/setup.c @@ -584,9 +584,7 @@ static void iseries_shared_idle(void) if (hvlpevent_is_pending()) process_iSeries_events(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } @@ -615,9 +613,7 @@ static void iseries_dedicated_idle(void) ppc64_runlatch_on(); rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index e795933eb2cb..7618085b4164 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -97,9 +97,7 @@ void cpu_idle(void) tick_nohz_idle_exit(); if (test_thread_flag(TIF_MCCK_PENDING)) s390_handle_mcck(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/score/kernel/process.c b/arch/score/kernel/process.c index 25d08030a883..2707023c7563 100644 --- a/arch/score/kernel/process.c +++ b/arch/score/kernel/process.c @@ -53,9 +53,7 @@ void __noreturn cpu_idle(void) while (!need_resched()) barrier(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 406508d4ce74..7e4892826563 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -114,9 +114,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index f793742eec2b..935fdbcd88c2 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } @@ -138,9 +136,7 @@ void cpu_idle(void) while (!need_resched()) cpu_relax(); } - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); check_pgt_cache(); } } diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 39d8b05201a2..ab9a29268213 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -104,15 +104,13 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - #ifdef CONFIG_HOTPLUG_CPU - if (cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) { + preempt_enable_no_resched(); cpu_play_dead(); + } #endif - - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index 4c1ac6e5347a..6ae495ef2b99 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -108,9 +108,7 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c08d1ff12b7c..49888fefe794 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -119,9 +119,7 @@ void cpu_idle(void) } rcu_idle_exit(); tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index cfa5c90c01db..e34257c70c28 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -156,9 +156,7 @@ void cpu_idle(void) } tick_nohz_idle_exit(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 47041e7c088c..2c9004770c4e 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -113,9 +113,7 @@ void cpu_idle(void) while (1) { while (!need_resched()) platform_idle(); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } } diff --git a/init/main.c b/init/main.c index ff49a6dacfbb..4990f7ec776a 100644 --- a/init/main.c +++ b/init/main.c @@ -374,11 +374,8 @@ static noinline void __init_refok rest_init(void) * at least once to get things moving: */ init_idle_bootup_task(current); - preempt_enable_no_resched(); - schedule(); - + schedule_preempt_disabled(); /* Call into cpu_idle with preempt disabled */ - preempt_disable(); cpu_idle(); } diff --git a/kernel/mutex.c b/kernel/mutex.c index 89096dd8786f..a307cc9c9526 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -240,9 +240,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..79b524767a24 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -744,9 +744,7 @@ static int run_ksoftirqd(void * __bind_cpu) while (!kthread_should_stop()) { preempt_disable(); if (!local_softirq_pending()) { - preempt_enable_no_resched(); - schedule(); - preempt_disable(); + schedule_preempt_disabled(); } __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From ba74c1448f127649046615ec017bded7b2a76f29 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Mar 2011 13:32:17 +0100 Subject: sched/rt: Document scheduler related skip-resched-check sites Create a distinction between scheduler related preempt_enable_no_resched() calls and the nearly one hundred other places in the kernel that do not want to reschedule, for one reason or another. This distinction matters for -rt, where the scheduler and the non-scheduler preempt models (and checks) are different. For upstream it's purely documentational. Signed-off-by: Thomas Gleixner Link: http://lkml.kernel.org/n/tip-gs88fvx2mdv5psnzxnv575ke@git.kernel.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/idle.c | 2 +- arch/sparc/kernel/process_64.c | 2 +- include/linux/preempt.h | 5 ++++- kernel/sched/core.c | 6 +++--- kernel/softirq.c | 4 ++-- 5 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index 65035141552b..c97fc60c790c 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -102,7 +102,7 @@ void cpu_idle(void) rcu_idle_exit(); tick_nohz_idle_exit(); if (cpu_should_die()) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cpu_die(); } schedule_preempt_disabled(); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index ab9a29268213..06b5b5fc20c7 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -106,7 +106,7 @@ void cpu_idle(void) #ifdef CONFIG_HOTPLUG_CPU if (cpu_is_offline(cpu)) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cpu_play_dead(); } #endif diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 58969b2a8a82..5a710b9c578e 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -48,12 +48,14 @@ do { \ barrier(); \ } while (0) -#define preempt_enable_no_resched() \ +#define sched_preempt_enable_no_resched() \ do { \ barrier(); \ dec_preempt_count(); \ } while (0) +#define preempt_enable_no_resched() sched_preempt_enable_no_resched() + #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -92,6 +94,7 @@ do { \ #else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) +#define sched_preempt_enable_no_resched() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 73022395c00e..643cc37fcb23 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3220,7 +3220,7 @@ need_resched: post_schedule(rq); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); if (need_resched()) goto need_resched; } @@ -3253,7 +3253,7 @@ EXPORT_SYMBOL(schedule); */ void __sched schedule_preempt_disabled(void) { - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); preempt_disable(); } @@ -4486,7 +4486,7 @@ SYSCALL_DEFINE0(sched_yield) __release(rq->lock); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); do_raw_spin_unlock(&rq->lock); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); schedule(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 79b524767a24..f268369ebe1f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -353,7 +353,7 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); } /* @@ -759,7 +759,7 @@ static int run_ksoftirqd(void * __bind_cpu) if (local_softirq_pending()) __do_softirq(); local_irq_enable(); - preempt_enable_no_resched(); + sched_preempt_enable_no_resched(); cond_resched(); preempt_disable(); rcu_note_context_switch((long)__bind_cpu); -- cgit v1.2.3 From 63b2001169e75cd71e917ec953fdab572e3f944a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 1 Dec 2011 00:04:00 +0100 Subject: sched/wait: Add __wake_up_all_locked() API For code which protects the waitqueue itself with another lock it makes no sense to acquire the waitqueue lock for wakeup all. Provide __wake_up_all_locked(). This is an optimization on the vanilla kernel (to be used by the PCI code) and an important semantic distinction on -rt. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ux6m4b8jonb9inx8xafh77ds@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/wait.h | 5 +++-- kernel/sched/core.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index a9ce45e8501c..7d9a9e990ce6 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -157,7 +157,7 @@ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, void *key); -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode); +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr); void __wake_up_bit(wait_queue_head_t *, void *, int); int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned); @@ -170,7 +170,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); #define wake_up(x) __wake_up(x, TASK_NORMAL, 1, NULL) #define wake_up_nr(x, nr) __wake_up(x, TASK_NORMAL, nr, NULL) #define wake_up_all(x) __wake_up(x, TASK_NORMAL, 0, NULL) -#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL) +#define wake_up_locked(x) __wake_up_locked((x), TASK_NORMAL, 1) +#define wake_up_all_locked(x) __wake_up_locked((x), TASK_NORMAL, 0) #define wake_up_interruptible(x) __wake_up(x, TASK_INTERRUPTIBLE, 1, NULL) #define wake_up_interruptible_nr(x, nr) __wake_up(x, TASK_INTERRUPTIBLE, nr, NULL) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 643cc37fcb23..820f7453fda9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3418,9 +3418,9 @@ EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ -void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr) { - __wake_up_common(q, mode, 1, 0, NULL); + __wake_up_common(q, mode, nr, 0, NULL); } EXPORT_SYMBOL_GPL(__wake_up_locked); -- cgit v1.2.3 From 1c4dd99bed5f6f70932bf8dacdd54d04a2619778 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 6 Jun 2011 20:07:38 +0200 Subject: sched/rt: Prevent idle task boosting Idle task boosting is a nono in general. There is one exception, when PREEMPT_RT and NOHZ is active: The idle task calls get_next_timer_interrupt() and holds the timer wheel base->lock on the CPU and another CPU wants to access the timer (probably to cancel it). We can safely ignore the boosting request, as the idle CPU runs this code with interrupts disabled and will complete the lock protected section without being interrupted. So there is no real need to boost. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-755rvsosz7sdzot12a3gbha6@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 820f7453fda9..c2bbdecaa27d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3779,6 +3779,24 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = __task_rq_lock(p); + /* + * Idle task boosting is a nono in general. There is one + * exception, when PREEMPT_RT and NOHZ is active: + * + * The idle task calls get_next_timer_interrupt() and holds + * the timer wheel base->lock on the CPU and another CPU wants + * to access the timer (probably to cancel it). We can safely + * ignore the boosting request, as the idle CPU runs this code + * with interrupts disabled and will complete the lock + * protected section without being interrupted. So there is no + * real need to boost. + */ + if (unlikely(p == rq->idle)) { + WARN_ON(p != rq->curr); + WARN_ON(p->pi_blocked_on); + goto out_unlock; + } + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; @@ -3802,11 +3820,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); check_class_changed(rq, p, prev_class, oldprio); +out_unlock: __task_rq_unlock(rq); } - #endif - void set_user_nice(struct task_struct *p, long nice) { int old_prio, delta, on_rq; -- cgit v1.2.3 From 3c7d51843b03a6839e9ec7cda724e54d2319a63a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 17 Jul 2011 20:46:52 +0200 Subject: sched/rt: Do not submit new work when PI-blocked When we are PI-blocked then we want to get things done ASAP. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-vw8et3445km5b8mpihf4trae@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++++++++ kernel/sched/core.c | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 03dd224d0667..c628a9151437 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2082,12 +2082,20 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice; extern int rt_mutex_getprio(struct task_struct *p); extern void rt_mutex_setprio(struct task_struct *p, int prio); extern void rt_mutex_adjust_pi(struct task_struct *p); +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return tsk->pi_blocked_on != NULL; +} #else static inline int rt_mutex_getprio(struct task_struct *p) { return p->normal_prio; } # define rt_mutex_adjust_pi(p) do { } while (0) +static inline bool tsk_is_pi_blocked(struct task_struct *tsk) +{ + return false; +} #endif extern bool yield_to(struct task_struct *p, bool preempt); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c2bbdecaa27d..25e06a5e048b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3227,7 +3227,7 @@ need_resched: static inline void sched_submit_work(struct task_struct *tsk) { - if (!tsk->state) + if (!tsk->state || tsk_is_pi_blocked(tsk)) return; /* * If we are going to sleep and we have plugged IO queued, -- cgit v1.2.3 From 8e45cb545d98bc58e75b7de89ec8d3e5c8459ee6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Feb 2012 12:47:19 +0100 Subject: sched: Move load-balancing arguments into helper struct Passing large sets of similar arguments all around the load-balancer gets tiresom when you want to modify something. Stick them all in a helper structure and pass the structure around. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-5slqz0vhsdzewrfk9eza1aon@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 177 +++++++++++++++++++++++++++------------------------- 1 file changed, 93 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79e9e13c31ab..55b1f117419a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3135,13 +3135,25 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) #define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ #define LBF_ABORT 0x10 +struct lb_env { + struct sched_domain *sd; + + int this_cpu; + struct rq *this_rq; + + struct rq *busiest_rq; + struct cfs_rq *busiest_cfs_rq; + + enum cpu_idle_type idle; + unsigned long max_load_move; + unsigned int flags; +}; + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static -int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +int can_migrate_task(struct task_struct *p, struct lb_env *env) { int tsk_cache_hot = 0; /* @@ -3150,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } - *lb_flags &= ~LBF_ALL_PINNED; + env->flags &= ~LBF_ALL_PINNED; - if (task_running(rq, p)) { + if (task_running(env->busiest_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3167,12 +3179,12 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, rq->clock_task, sd); + tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd); if (!tsk_cache_hot || - sd->nr_balance_failed > sd->cache_nice_tries) { + env->sd->nr_balance_failed > env->sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS if (tsk_cache_hot) { - schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } #endif @@ -3193,31 +3205,27 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * * Called with both runqueues locked. */ -static int -move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, - struct sched_domain *sd, enum cpu_idle_type idle) +static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; struct cfs_rq *cfs_rq; - int pinned = 0; - for_each_leaf_cfs_rq(busiest, cfs_rq) { + for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { if (throttled_lb_pair(task_group(p), - busiest->cpu, this_cpu)) + env->busiest_rq->cpu, env->this_cpu)) break; - if (!can_migrate_task(p, busiest, this_cpu, - sd, idle, &pinned)) + if (!can_migrate_task(p, env)) continue; - pull_task(busiest, p, this_rq, this_cpu); + pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); /* * Right now, this is only the second place pull_task() * is called, so we can safely collect pull_task() * stats here rather than inside pull_task(). */ - schedstat_inc(sd, lb_gained[idle]); + schedstat_inc(env->sd, lb_gained[env->idle]); return 1; } } @@ -3225,31 +3233,26 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -static unsigned long -balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, struct sched_domain *sd, - enum cpu_idle_type idle, int *lb_flags, - struct cfs_rq *busiest_cfs_rq) +static unsigned long balance_tasks(struct lb_env *env) { int loops = 0, pulled = 0; - long rem_load_move = max_load_move; + long rem_load_move = env->max_load_move; struct task_struct *p, *n; - if (max_load_move == 0) + if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { + list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) { - *lb_flags |= LBF_NEED_BREAK; + env->flags |= LBF_NEED_BREAK; break; } if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, busiest, this_cpu, sd, idle, - lb_flags)) + !can_migrate_task(p, env)) continue; - pull_task(busiest, p, this_rq, this_cpu); + pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); pulled++; rem_load_move -= p->se.load.weight; @@ -3259,8 +3262,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) { + env->flags |= LBF_ABORT; break; } #endif @@ -3278,9 +3281,9 @@ out: * so we can safely collect pull_task() stats here rather than * inside pull_task(). */ - schedstat_add(sd, lb_gained[idle], pulled); + schedstat_add(env->sd, lb_gained[env->idle], pulled); - return max_load_move - rem_load_move; + return env->max_load_move - rem_load_move; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3363,40 +3366,39 @@ static void update_h_load(long cpu) walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long load_balance_fair(struct lb_env *env) { - long rem_load_move = max_load_move; - struct cfs_rq *busiest_cfs_rq; + unsigned long max_load_move = env->max_load_move; + long rem_load_move = env->max_load_move; rcu_read_lock(); - update_h_load(cpu_of(busiest)); + update_h_load(cpu_of(env->busiest_rq)); - for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { - unsigned long busiest_h_load = busiest_cfs_rq->h_load; - unsigned long busiest_weight = busiest_cfs_rq->load.weight; + for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) { + unsigned long busiest_h_load = env->busiest_cfs_rq->h_load; + unsigned long busiest_weight = env->busiest_cfs_rq->load.weight; u64 rem_load, moved_load; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) break; /* * empty group or part of a throttled hierarchy */ - if (!busiest_cfs_rq->task_weight || - throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu)) + if (!env->busiest_cfs_rq->task_weight) + continue; + + if (throttled_lb_pair(env->busiest_cfs_rq->tg, + cpu_of(env->busiest_rq), + env->this_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; rem_load = div_u64(rem_load, busiest_h_load + 1); - moved_load = balance_tasks(this_rq, this_cpu, busiest, - rem_load, sd, idle, lb_flags, - busiest_cfs_rq); + env->max_load_move = rem_load; + moved_load = balance_tasks(env); if (!moved_load) continue; @@ -3416,15 +3418,10 @@ static inline void update_shares(int cpu) { } -static unsigned long -load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static unsigned long load_balance_fair(struct lb_env *env) { - return balance_tasks(this_rq, this_cpu, busiest, - max_load_move, sd, idle, lb_flags, - &busiest->cfs); + env->busiest_cfs_rq = &env->busiest_rq->cfs; + return balance_tasks(env); } #endif @@ -3435,21 +3432,17 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, * * Called with both runqueues locked. */ -static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_load_move, - struct sched_domain *sd, enum cpu_idle_type idle, - int *lb_flags) +static int move_tasks(struct lb_env *env) { + unsigned long max_load_move = env->max_load_move; unsigned long total_load_moved = 0, load_moved; do { - load_moved = load_balance_fair(this_rq, this_cpu, busiest, - max_load_move - total_load_moved, - sd, idle, lb_flags); - + env->max_load_move = max_load_move - total_load_moved; + load_moved = load_balance_fair(env); total_load_moved += load_moved; - if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT)) + if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) break; #ifdef CONFIG_PREEMPT @@ -3458,8 +3451,8 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) { - *lb_flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) { + env->flags |= LBF_ABORT; break; } #endif @@ -4459,13 +4452,20 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int ld_moved, lb_flags = 0, active_balance = 0; + int ld_moved, active_balance = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; unsigned long flags; struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct lb_env env = { + .sd = sd, + .this_cpu = this_cpu, + .this_rq = this_rq, + .idle = idle, + }; + cpumask_copy(cpus, cpu_active_mask); schedstat_inc(sd, lb_count[idle]); @@ -4500,11 +4500,13 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; + env.max_load_move = imbalance; + env.busiest_rq = busiest; + local_irq_save(flags); double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle, &lb_flags); + ld_moved = move_tasks(&env); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -4514,18 +4516,18 @@ redo: if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (lb_flags & LBF_ABORT) + if (env.flags & LBF_ABORT) goto out_balanced; - if (lb_flags & LBF_NEED_BREAK) { - lb_flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (lb_flags & LBF_ABORT) + if (env.flags & LBF_NEED_BREAK) { + env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK; + if (env.flags & LBF_ABORT) goto out_balanced; goto redo; } /* All tasks on this runqueue were pinned by CPU affinity */ - if (unlikely(lb_flags & LBF_ALL_PINNED)) { + if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); if (!cpumask_empty(cpus)) goto redo; @@ -4555,7 +4557,7 @@ redo: tsk_cpus_allowed(busiest->curr))) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - lb_flags |= LBF_ALL_PINNED; + env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } @@ -4608,7 +4610,7 @@ out_balanced: out_one_pinned: /* tune up the balancing interval */ - if (((lb_flags & LBF_ALL_PINNED) && + if (((env.flags & LBF_ALL_PINNED) && sd->balance_interval < MAX_PINNED_INTERVAL) || (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; @@ -4718,10 +4720,17 @@ static int active_load_balance_cpu_stop(void *data) } if (likely(sd)) { + struct lb_env env = { + .sd = sd, + .this_cpu = target_cpu, + .this_rq = target_rq, + .busiest_rq = busiest_rq, + .idle = CPU_IDLE, + }; + schedstat_inc(sd, alb_count); - if (move_one_task(target_rq, target_cpu, busiest_rq, - sd, CPU_IDLE)) + if (move_one_task(&env)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); -- cgit v1.2.3 From ddcdf6e7d9919d139031fa2a6addd9544a9a833e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Feb 2012 19:27:40 +0100 Subject: sched: Rename load-balancing fields s/env->this_/env->dst_/g s/env->busiest_/env->src_/g s/pull_task/move_task/g Makes everything clearer. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-0yvgms8t8x962drpvl0fu0kk@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 118 ++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 55b1f117419a..233d05171bf5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2918,7 +2918,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; /* - * This is possible from callers such as pull_task(), in which we + * This is possible from callers such as move_task(), in which we * unconditionally check_prempt_curr() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. @@ -3084,17 +3084,37 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; +#define LBF_ALL_PINNED 0x01 +#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ +#define LBF_HAD_BREAK 0x04 +#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ +#define LBF_ABORT 0x10 + +struct lb_env { + struct sched_domain *sd; + + int src_cpu; + struct rq *src_rq; + struct cfs_rq *src_cfs_rq; + + int dst_cpu; + struct rq *dst_rq; + + enum cpu_idle_type idle; + unsigned long max_load_move; + unsigned int flags; +}; + /* - * pull_task - move a task from a remote runqueue to the local runqueue. + * move_task - move a task from one runqueue to another runqueue. * Both runqueues must be locked. */ -static void pull_task(struct rq *src_rq, struct task_struct *p, - struct rq *this_rq, int this_cpu) +static void move_task(struct task_struct *p, struct lb_env *env) { - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - check_preempt_curr(this_rq, p, 0); + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + activate_task(env->dst_rq, p, 0); + check_preempt_curr(env->dst_rq, p, 0); } /* @@ -3129,26 +3149,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) return delta < (s64)sysctl_sched_migration_cost; } -#define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 - -struct lb_env { - struct sched_domain *sd; - - int this_cpu; - struct rq *this_rq; - - struct rq *busiest_rq; - struct cfs_rq *busiest_cfs_rq; - - enum cpu_idle_type idle; - unsigned long max_load_move; - unsigned int flags; -}; - /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -3162,13 +3162,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (!cpumask_test_cpu(env->this_cpu, tsk_cpus_allowed(p))) { + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { schedstat_inc(p, se.statistics.nr_failed_migrations_affine); return 0; } env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->busiest_rq, p)) { + if (task_running(env->src_rq, p)) { schedstat_inc(p, se.statistics.nr_failed_migrations_running); return 0; } @@ -3179,7 +3179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) * 2) too many balance attempts have failed. */ - tsk_cache_hot = task_hot(p, env->busiest_rq->clock_task, env->sd); + tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { #ifdef CONFIG_SCHEDSTATS @@ -3210,20 +3210,20 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(env->busiest_rq, cfs_rq) { + for_each_leaf_cfs_rq(env->src_rq, cfs_rq) { list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { if (throttled_lb_pair(task_group(p), - env->busiest_rq->cpu, env->this_cpu)) + env->src_cpu, env->dst_cpu)) break; if (!can_migrate_task(p, env)) continue; - pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); + move_task(p, env); /* - * Right now, this is only the second place pull_task() - * is called, so we can safely collect pull_task() - * stats here rather than inside pull_task(). + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). */ schedstat_inc(env->sd, lb_gained[env->idle]); return 1; @@ -3242,7 +3242,7 @@ static unsigned long balance_tasks(struct lb_env *env) if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &env->busiest_cfs_rq->tasks, se.group_node) { + list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) { if (loops++ > sysctl_sched_nr_migrate) { env->flags |= LBF_NEED_BREAK; break; @@ -3252,7 +3252,7 @@ static unsigned long balance_tasks(struct lb_env *env) !can_migrate_task(p, env)) continue; - pull_task(env->busiest_rq, p, env->this_rq, env->this_cpu); + move_task(p, env); pulled++; rem_load_move -= p->se.load.weight; @@ -3277,9 +3277,9 @@ static unsigned long balance_tasks(struct lb_env *env) } out: /* - * Right now, this is one of only two places pull_task() is called, - * so we can safely collect pull_task() stats here rather than - * inside pull_task(). + * Right now, this is one of only two places move_task() is called, + * so we can safely collect move_task() stats here rather than + * inside move_task(). */ schedstat_add(env->sd, lb_gained[env->idle], pulled); @@ -3372,11 +3372,11 @@ static unsigned long load_balance_fair(struct lb_env *env) long rem_load_move = env->max_load_move; rcu_read_lock(); - update_h_load(cpu_of(env->busiest_rq)); + update_h_load(cpu_of(env->src_rq)); - for_each_leaf_cfs_rq(env->busiest_rq, env->busiest_cfs_rq) { - unsigned long busiest_h_load = env->busiest_cfs_rq->h_load; - unsigned long busiest_weight = env->busiest_cfs_rq->load.weight; + for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) { + unsigned long busiest_h_load = env->src_cfs_rq->h_load; + unsigned long busiest_weight = env->src_cfs_rq->load.weight; u64 rem_load, moved_load; if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) @@ -3385,12 +3385,12 @@ static unsigned long load_balance_fair(struct lb_env *env) /* * empty group or part of a throttled hierarchy */ - if (!env->busiest_cfs_rq->task_weight) + if (!env->src_cfs_rq->task_weight) continue; - if (throttled_lb_pair(env->busiest_cfs_rq->tg, - cpu_of(env->busiest_rq), - env->this_cpu)) + if (throttled_lb_pair(env->src_cfs_rq->tg, + cpu_of(env->src_rq), + env->dst_cpu)) continue; rem_load = (u64)rem_load_move * busiest_weight; @@ -3420,7 +3420,7 @@ static inline void update_shares(int cpu) static unsigned long load_balance_fair(struct lb_env *env) { - env->busiest_cfs_rq = &env->busiest_rq->cfs; + env->src_cfs_rq = &env->src_rq->cfs; return balance_tasks(env); } #endif @@ -3451,7 +3451,7 @@ static int move_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE && env->this_rq->nr_running) { + if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) { env->flags |= LBF_ABORT; break; } @@ -4461,8 +4461,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct lb_env env = { .sd = sd, - .this_cpu = this_cpu, - .this_rq = this_rq, + .dst_cpu = this_cpu, + .dst_rq = this_rq, .idle = idle, }; @@ -4502,7 +4502,8 @@ redo: */ env.flags |= LBF_ALL_PINNED; env.max_load_move = imbalance; - env.busiest_rq = busiest; + env.src_cpu = busiest->cpu; + env.src_rq = busiest; local_irq_save(flags); double_rq_lock(this_rq, busiest); @@ -4722,9 +4723,10 @@ static int active_load_balance_cpu_stop(void *data) if (likely(sd)) { struct lb_env env = { .sd = sd, - .this_cpu = target_cpu, - .this_rq = target_rq, - .busiest_rq = busiest_rq, + .dst_cpu = target_cpu, + .dst_rq = target_rq, + .src_cpu = busiest_rq->cpu, + .src_rq = busiest_rq, .idle = CPU_IDLE, }; -- cgit v1.2.3 From 367456c756a6b84f493ca9cc5b17b1f5d38ef466 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 20 Feb 2012 21:49:09 +0100 Subject: sched: Ditch per cgroup task lists for load-balancing Per cgroup load-balance has numerous problems, chief amongst them that there is no real sane order in them. So stop pretending it makes sense and enqueue all tasks on a single list. This also allows us to more easily fix the fwd progress issue uncovered by the lock-break stuff. Rotate the list on failure to migreate and limit the total iterations to nr_running (which with releasing the lock isn't strictly accurate but close enough). Also add a filter that skips very light tasks on the first attempt around the list, this attempts to avoid shooting whole cgroups around without affecting over balance. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Link: http://lkml.kernel.org/n/tip-tx8yqydc7eimgq7i4rkc3a4g@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 + kernel/sched/fair.c | 176 ++++++++++++++++++++++----------------------------- kernel/sched/sched.h | 10 +-- 3 files changed, 80 insertions(+), 109 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 25e06a5e048b..95545126be1c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6959,6 +6959,9 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; + + INIT_LIST_HEAD(&rq->cfs_tasks); + rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ rq->nohz_flags = 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 233d05171bf5..a0424fc4cc54 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -776,29 +776,16 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ -#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED -static void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ - cfs_rq->task_weight += weight; -} -#else -static inline void -add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) -{ -} -#endif - static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, se->load.weight); - list_add(&se->group_node, &cfs_rq->tasks); - } +#ifdef CONFIG_SMP + if (entity_is_task(se)) + list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); +#endif cfs_rq->nr_running++; } @@ -808,10 +795,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_sub(&cfs_rq->load, se->load.weight); if (!parent_entity(se)) update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); - if (entity_is_task(se)) { - add_cfs_task_weight(cfs_rq, -se->load.weight); + if (entity_is_task(se)) list_del_init(&se->group_node); - } cfs_rq->nr_running--; } @@ -3085,17 +3070,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 -#define LBF_NEED_BREAK 0x02 /* clears into HAD_BREAK */ -#define LBF_HAD_BREAK 0x04 -#define LBF_HAD_BREAKS 0x0C /* count HAD_BREAKs overflows into ABORT */ -#define LBF_ABORT 0x10 +#define LBF_NEED_BREAK 0x02 +#define LBF_ABORT 0x04 struct lb_env { struct sched_domain *sd; int src_cpu; struct rq *src_rq; - struct cfs_rq *src_cfs_rq; int dst_cpu; struct rq *dst_rq; @@ -3103,6 +3085,10 @@ struct lb_env { enum cpu_idle_type idle; unsigned long max_load_move; unsigned int flags; + + unsigned int loop; + unsigned int loop_break; + unsigned int loop_max; }; /* @@ -3208,53 +3194,69 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) static int move_one_task(struct lb_env *env) { struct task_struct *p, *n; - struct cfs_rq *cfs_rq; - for_each_leaf_cfs_rq(env->src_rq, cfs_rq) { - list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), - env->src_cpu, env->dst_cpu)) - break; + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) + continue; - if (!can_migrate_task(p, env)) - continue; + if (!can_migrate_task(p, env)) + continue; - move_task(p, env); - /* - * Right now, this is only the second place move_task() - * is called, so we can safely collect move_task() - * stats here rather than inside move_task(). - */ - schedstat_inc(env->sd, lb_gained[env->idle]); - return 1; - } + move_task(p, env); + /* + * Right now, this is only the second place move_task() + * is called, so we can safely collect move_task() + * stats here rather than inside move_task(). + */ + schedstat_inc(env->sd, lb_gained[env->idle]); + return 1; } - return 0; } +static unsigned long task_h_load(struct task_struct *p); + static unsigned long balance_tasks(struct lb_env *env) { - int loops = 0, pulled = 0; long rem_load_move = env->max_load_move; struct task_struct *p, *n; + unsigned long load; + int pulled = 0; if (env->max_load_move == 0) goto out; - list_for_each_entry_safe(p, n, &env->src_cfs_rq->tasks, se.group_node) { - if (loops++ > sysctl_sched_nr_migrate) { + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + env->loop++; + /* We've more or less seen every task there is, call it quits */ + if (env->loop > env->loop_max) { + env->flags |= LBF_ABORT; + break; + } + /* take a beather every nr_migrate tasks */ + if (env->loop > env->loop_break) { + env->loop_break += sysctl_sched_nr_migrate; env->flags |= LBF_NEED_BREAK; break; } - if ((p->se.load.weight >> 1) > rem_load_move || - !can_migrate_task(p, env)) - continue; + if (throttled_lb_pair(task_group(p), env->src_rq->cpu, + env->dst_cpu)) + goto next; + + load = task_h_load(p); + if (load < 16 && !env->sd->nr_balance_failed) + goto next; + + if ((load * 2) > rem_load_move) + goto next; + + if (!can_migrate_task(p, env)) + goto next; move_task(p, env); pulled++; - rem_load_move -= p->se.load.weight; + rem_load_move -= load; #ifdef CONFIG_PREEMPT /* @@ -3274,6 +3276,10 @@ static unsigned long balance_tasks(struct lb_env *env) */ if (rem_load_move <= 0) break; + + continue; +next: + list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks); } out: /* @@ -3363,65 +3369,33 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_h_load(long cpu) { + rcu_read_lock(); walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); + rcu_read_unlock(); } -static unsigned long load_balance_fair(struct lb_env *env) +static unsigned long task_h_load(struct task_struct *p) { - unsigned long max_load_move = env->max_load_move; - long rem_load_move = env->max_load_move; - - rcu_read_lock(); - update_h_load(cpu_of(env->src_rq)); - - for_each_leaf_cfs_rq(env->src_rq, env->src_cfs_rq) { - unsigned long busiest_h_load = env->src_cfs_rq->h_load; - unsigned long busiest_weight = env->src_cfs_rq->load.weight; - u64 rem_load, moved_load; - - if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - - /* - * empty group or part of a throttled hierarchy - */ - if (!env->src_cfs_rq->task_weight) - continue; - - if (throttled_lb_pair(env->src_cfs_rq->tg, - cpu_of(env->src_rq), - env->dst_cpu)) - continue; - - rem_load = (u64)rem_load_move * busiest_weight; - rem_load = div_u64(rem_load, busiest_h_load + 1); - - env->max_load_move = rem_load; - - moved_load = balance_tasks(env); - if (!moved_load) - continue; - - moved_load *= busiest_h_load; - moved_load = div_u64(moved_load, busiest_weight + 1); + struct cfs_rq *cfs_rq = task_cfs_rq(p); + unsigned long load; - rem_load_move -= moved_load; - if (rem_load_move < 0) - break; - } - rcu_read_unlock(); + load = p->se.load.weight; + load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1); - return max_load_move - rem_load_move; + return load; } #else static inline void update_shares(int cpu) { } -static unsigned long load_balance_fair(struct lb_env *env) +static inline void update_h_load(long cpu) { - env->src_cfs_rq = &env->src_rq->cfs; - return balance_tasks(env); +} + +static unsigned long task_h_load(struct task_struct *p) +{ + return p->se.load.weight; } #endif @@ -3437,9 +3411,10 @@ static int move_tasks(struct lb_env *env) unsigned long max_load_move = env->max_load_move; unsigned long total_load_moved = 0, load_moved; + update_h_load(cpu_of(env->src_rq)); do { env->max_load_move = max_load_move - total_load_moved; - load_moved = load_balance_fair(env); + load_moved = balance_tasks(env); total_load_moved += load_moved; if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) @@ -4464,6 +4439,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .dst_cpu = this_cpu, .dst_rq = this_rq, .idle = idle, + .loop_break = sysctl_sched_nr_migrate, }; cpumask_copy(cpus, cpu_active_mask); @@ -4504,6 +4480,7 @@ redo: env.max_load_move = imbalance; env.src_cpu = busiest->cpu; env.src_rq = busiest; + env.loop_max = busiest->nr_running; local_irq_save(flags); double_rq_lock(this_rq, busiest); @@ -4521,9 +4498,7 @@ redo: goto out_balanced; if (env.flags & LBF_NEED_BREAK) { - env.flags += LBF_HAD_BREAK - LBF_NEED_BREAK; - if (env.flags & LBF_ABORT) - goto out_balanced; + env.flags &= ~LBF_NEED_BREAK; goto redo; } @@ -5357,7 +5332,6 @@ static void set_curr_task_fair(struct rq *rq) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; - INIT_LIST_HEAD(&cfs_rq->tasks); cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c0660a1a0cd1..753bdd567416 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -212,9 +212,6 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct list_head tasks; - struct list_head *balance_iterator; - /* * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). @@ -241,11 +238,6 @@ struct cfs_rq { struct task_group *tg; /* group that "owns" this runqueue */ #ifdef CONFIG_SMP - /* - * the part of load.weight contributed by tasks - */ - unsigned long task_weight; - /* * h_load = weight * f(tg) * @@ -420,6 +412,8 @@ struct rq { int cpu; int online; + struct list_head cfs_tasks; + u64 rt_avg; u64 age_stamp; u64 idle_stamp; -- cgit v1.2.3 From b892e5c89787716b95a8e55d77d25a1c0748df10 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 1 Mar 2012 22:06:48 -0500 Subject: tracing: Keep NMI watchdog from triggering when dumping trace As ftrace_dump() (called by ftrace_dump_on_oops) disables interrupts as it dumps its output to the console, it can keep interrupts disabled for long periods of time. This is likely to trigger the NMI watchdog, and it can disrupt the output of critical data. Add a touch_nmi_watchdog() to each event that is written to the screen to keep the NMI watchdog from affecting the output. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f3c13d63d064..3a19c354edd6 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "trace.h" @@ -4903,6 +4904,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) if (ret != TRACE_TYPE_NO_CONSUME) trace_consume(&iter); } + touch_nmi_watchdog(); trace_printk_seq(&iter.seq); } -- cgit v1.2.3 From 62d3c5439c534b0e6c653fc63e6d8c67be3a57b1 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 2 Mar 2012 10:51:00 +0100 Subject: Block: use a freezable workqueue for disk-event polling This patch (as1519) fixes a bug in the block layer's disk-events polling. The polling is done by a work routine queued on the system_nrt_wq workqueue. Since that workqueue isn't freezable, the polling continues even in the middle of a system sleep transition. Obviously, polling a suspended drive for media changes and such isn't a good thing to do; in the case of USB mass-storage devices it can lead to real problems requiring device resets and even re-enumeration. The patch fixes things by creating a new system-wide, non-reentrant, freezable workqueue and using it for disk-events polling. Signed-off-by: Alan Stern CC: Acked-by: Tejun Heo Acked-by: Rafael J. Wysocki Signed-off-by: Jens Axboe --- block/genhd.c | 10 +++++----- include/linux/workqueue.h | 4 ++++ kernel/workqueue.c | 7 ++++++- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/block/genhd.c b/block/genhd.c index b26c4085590d..df9816ede75b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1478,9 +1478,9 @@ static void __disk_unblock_events(struct gendisk *disk, bool check_now) intv = disk_events_poll_jiffies(disk); set_timer_slack(&ev->dwork.timer, intv / 4); if (check_now) - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); else if (intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); out_unlock: spin_unlock_irqrestore(&ev->lock, flags); } @@ -1524,7 +1524,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) ev->clearing |= mask; if (!ev->block) { cancel_delayed_work(&ev->dwork); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); } spin_unlock_irq(&ev->lock); } @@ -1561,7 +1561,7 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) /* uncondtionally schedule event check and wait for it to finish */ disk_block_events(disk); - queue_delayed_work(system_nrt_wq, &ev->dwork, 0); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, 0); flush_delayed_work(&ev->dwork); __disk_unblock_events(disk, false); @@ -1598,7 +1598,7 @@ static void disk_events_workfn(struct work_struct *work) intv = disk_events_poll_jiffies(disk); if (!ev->block && intv) - queue_delayed_work(system_nrt_wq, &ev->dwork, intv); + queue_delayed_work(system_nrt_freezable_wq, &ev->dwork, intv); spin_unlock_irq(&ev->lock); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index eb8b9f15f2e0..af155450cabb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -289,12 +289,16 @@ enum { * * system_freezable_wq is equivalent to system_wq except that it's * freezable. + * + * system_nrt_freezable_wq is equivalent to system_nrt_wq except that + * it's freezable. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_nrt_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; +extern struct workqueue_struct *system_nrt_freezable_wq; extern struct workqueue_struct * __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bec7b5b53e03..f2c5638bb5ab 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -253,11 +253,13 @@ struct workqueue_struct *system_long_wq __read_mostly; struct workqueue_struct *system_nrt_wq __read_mostly; struct workqueue_struct *system_unbound_wq __read_mostly; struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_nrt_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); EXPORT_SYMBOL_GPL(system_freezable_wq); +EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); #define CREATE_TRACE_POINTS #include @@ -3833,8 +3835,11 @@ static int __init init_workqueues(void) WQ_UNBOUND_MAX_ACTIVE); system_freezable_wq = alloc_workqueue("events_freezable", WQ_FREEZABLE, 0); + system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", + WQ_NON_REENTRANT | WQ_FREEZABLE, 0); BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || - !system_unbound_wq || !system_freezable_wq); + !system_unbound_wq || !system_freezable_wq || + !system_nrt_freezable_wq); return 0; } early_initcall(init_workqueues); -- cgit v1.2.3 From 2e5b5b3a1b7768c89fbfeca18e75f8ee377e924c Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 23 Feb 2012 17:41:27 +0900 Subject: sched: Clean up parameter passing of proc_sched_autogroup_set_nice() Pass nice as a value to proc_sched_autogroup_set_nice(). No side effect is expected, and the variable err will be overwritten with the return value. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/4F45FBB7.5090607@ct.jp.nec.com Signed-off-by: Ingo Molnar --- fs/proc/base.c | 3 +-- include/linux/sched.h | 2 +- kernel/sched/auto_group.c | 12 ++++++------ 3 files changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index d4548dd49b02..965d4bde3a3b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1310,8 +1310,7 @@ sched_autogroup_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - err = nice; - err = proc_sched_autogroup_set_nice(p, &err); + err = proc_sched_autogroup_set_nice(p, nice); if (err) count = err; diff --git a/include/linux/sched.h b/include/linux/sched.h index c628a9151437..c298fb9cf5ad 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2065,7 +2065,7 @@ extern void sched_autogroup_fork(struct signal_struct *sig); extern void sched_autogroup_exit(struct signal_struct *sig); #ifdef CONFIG_PROC_FS extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m); -extern int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice); +extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice); #endif #else static inline void sched_autogroup_create_attach(struct task_struct *p) { } diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index e8a1f83ee0e7..0984a21076a3 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -195,20 +195,20 @@ __setup("noautogroup", setup_autogroup); #ifdef CONFIG_PROC_FS -int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) { static unsigned long next = INITIAL_JIFFIES; struct autogroup *ag; int err; - if (*nice < -20 || *nice > 19) + if (nice < -20 || nice > 19) return -EINVAL; - err = security_task_setnice(current, *nice); + err = security_task_setnice(current, nice); if (err) return err; - if (*nice < 0 && !can_nice(current, *nice)) + if (nice < 0 && !can_nice(current, nice)) return -EPERM; /* this is a heavy operation taking global locks.. */ @@ -219,9 +219,9 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); if (!err) - ag->nice = *nice; + ag->nice = nice; up_write(&ag->lock); autogroup_kref_put(ag); -- cgit v1.2.3 From 05b4877f6a4f1ba4952d1222213d262bf8c132b7 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Fri, 17 Feb 2012 23:39:51 +0100 Subject: PM / Hibernate: Enable usermodehelpers in hibernate() error path If create_basic_memory_bitmaps() fails, usermodehelpers are not re-enabled before returning. Fix this. And while at it, reword the goto labels so that they look more meaningful. Signed-off-by: Srivatsa S. Bhat Cc: stable@vger.kernel.org Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 72baaf011fb7..0a186cfde788 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -618,7 +618,7 @@ int hibernate(void) /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) - goto Exit; + goto Enable_umh; printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); @@ -626,7 +626,7 @@ int hibernate(void) error = freeze_processes(); if (error) - goto Finish; + goto Free_bitmaps; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) @@ -659,8 +659,9 @@ int hibernate(void) /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; - Finish: + Free_bitmaps: free_basic_memory_bitmaps(); + Enable_umh: usermodehelper_enable(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); -- cgit v1.2.3 From 37f08be11be9a7d9351fb1b9b408259519a126f3 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Tue, 21 Feb 2012 23:57:47 +0100 Subject: PM / Freezer: Remove references to TIF_FREEZE in comments This patch removes all the references in the code about the TIF_FREEZE flag removed by commit a3201227f803ad7fd43180c5195dbe5a2bf998aa freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE There still are some references to TIF_FREEZE in Documentation/power/freezing-of-tasks.txt, but it looks like that documentation needs more thorough work to reflect how the new freezer works, and hence merely removing the references to TIF_FREEZE won't really help. So I have not touched that part in this patch. Suggested-by: Srivatsa S. Bhat Signed-off-by: Marcos Paulo de Souza Signed-off-by: Rafael J. Wysocki --- kernel/exit.c | 2 +- kernel/freezer.c | 6 +++--- kernel/power/process.c | 8 +++----- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 294b1709170d..fd0af05e0639 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -424,7 +424,7 @@ void daemonize(const char *name, ...) */ exit_mm(current); /* - * We don't want to have TIF_FREEZE set if the system-wide hibernation + * We don't want to get frozen, in case system-wide hibernation * or suspend transition begins right now. */ current->flags |= (PF_NOFREEZE | PF_KTHREAD); diff --git a/kernel/freezer.c b/kernel/freezer.c index 9815b8d1eed5..11f82a4d4eae 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -99,9 +99,9 @@ static void fake_signal_wake_up(struct task_struct *p) * freeze_task - send a freeze request to given task * @p: task to send the request to * - * If @p is freezing, the freeze request is sent by setting %TIF_FREEZE - * flag and either sending a fake signal to it or waking it up, depending - * on whether it has %PF_FREEZER_NOSIG set. + * If @p is freezing, the freeze request is sent either by sending a fake + * signal (if it's not a kernel thread) or waking it up (if it's a kernel + * thread). * * RETURNS: * %false, if @p is not freezing or already frozen; %true, otherwise diff --git a/kernel/power/process.c b/kernel/power/process.c index 6aeb5efe00eb..0d2aeb226108 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -53,11 +53,9 @@ static int try_to_freeze_tasks(bool user_only) * It is "frozen enough". If the task does wake * up, it will immediately call try_to_freeze. * - * Because freeze_task() goes through p's - * scheduler lock after setting TIF_FREEZE, it's - * guaranteed that either we see TASK_RUNNING or - * try_to_stop() after schedule() in ptrace/signal - * stop sees TIF_FREEZE. + * Because freeze_task() goes through p's scheduler lock, it's + * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING + * transition can't race with task state testing here. */ if (!task_is_stopped_or_traced(p) && !freezer_should_skip(p)) -- cgit v1.2.3 From bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:51 +0100 Subject: perf: Add generic taken branch sampling support This patch adds the ability to sample taken branches to the perf_event interface. The ability to capture taken branches is very useful for all sorts of analysis. For instance, basic block profiling, call counts, statistical call graph. This new capability requires hardware assist and as such may not be available on all HW platforms. On Intel x86 it is implemented on top of the Last Branch Record (LBR) facility. To enable taken branches sampling, the PERF_SAMPLE_BRANCH_STACK bit must be set in attr->sample_type. Sampled taken branches may be filtered by type and/or priv levels. The patch adds a new field, called branch_sample_type, to the perf_event_attr structure. It contains a bitmask of filters to apply to the sampled taken branches. Filters may be implemented in HW. If the HW filter does not exist or is not good enough, some arch may also implement a SW filter. The following generic filters are currently defined: - PERF_SAMPLE_USER only branches whose targets are at the user level - PERF_SAMPLE_KERNEL only branches whose targets are at the kernel level - PERF_SAMPLE_HV only branches whose targets are at the hypervisor level - PERF_SAMPLE_ANY any type of branches (subject to priv levels filters) - PERF_SAMPLE_ANY_CALL any call branches (may incl. syscall on some arch) - PERF_SAMPLE_ANY_RET any return branches (may incl. syscall returns on some arch) - PERF_SAMPLE_IND_CALL indirect call branches Obviously filter may be combined. The priv level bits are optional. If not provided, the priv level of the associated event are used. It is possible to collect branches at a priv level different from the associated event. Use of kernel, hv priv levels is subject to permissions and availability (hv). The number of taken branch records present in each sample may vary based on HW, the type of sampled branches, the executed code. Therefore each sample contains the number of taken branches it contains. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 21 +++++---- include/linux/perf_event.h | 71 ++++++++++++++++++++++++++++-- kernel/events/core.c | 68 ++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 47a7e63bfe54..309d0cc69163 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -142,9 +142,11 @@ static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); - cpuc->lbr_entries[i].from = msr_lastbranch.from; - cpuc->lbr_entries[i].to = msr_lastbranch.to; - cpuc->lbr_entries[i].flags = 0; + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } @@ -165,19 +167,22 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) for (i = 0; i < x86_pmu.lbr_nr; i++) { unsigned long lbr_idx = (tos - i) & mask; - u64 from, to, flags = 0; + u64 from, to, mis = 0, pred = 0; rdmsrl(x86_pmu.lbr_from + lbr_idx, from); rdmsrl(x86_pmu.lbr_to + lbr_idx, to); if (lbr_format == LBR_FORMAT_EIP_FLAGS) { - flags = !!(from & LBR_FROM_FLAG_MISPRED); + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; from = (u64)((((s64)from) << 1) >> 1); } - cpuc->lbr_entries[i].from = from; - cpuc->lbr_entries[i].to = to; - cpuc->lbr_entries[i].flags = flags; + cpuc->lbr_entries[i].from = from; + cpuc->lbr_entries[i].to = to; + cpuc->lbr_entries[i].mispred = mis; + cpuc->lbr_entries[i].predicted = pred; + cpuc->lbr_entries[i].reserved = 0; } cpuc->lbr_stack.nr = i; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 64426b71381f..5fc494f4a094 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,10 +129,39 @@ enum perf_event_sample_format { PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 12, /* non-ABI */ }; +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL = 1U << 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV = 1U << 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY = 1U << 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL = 1U << 6, /* indirect calls */ + + PERF_SAMPLE_BRANCH_MAX = 1U << 7, /* non-ABI */ +}; + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + /* * The format of the data returned by read() on a perf event fd, * as specified by attr.read_format: @@ -240,6 +269,7 @@ struct perf_event_attr { __u64 bp_len; __u64 config2; /* extension of config1 */ }; + __u64 branch_sample_type; /* enum branch_sample_type */ }; /* @@ -458,6 +488,8 @@ enum perf_event_type { * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 from, to, flags } lbr[nr];} && PERF_SAMPLE_BRANCH_STACK * }; */ PERF_RECORD_SAMPLE = 9, @@ -530,12 +562,34 @@ struct perf_raw_record { void *data; }; +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. + */ struct perf_branch_entry { - __u64 from; - __u64 to; - __u64 flags; + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + reserved:62; }; +/* + * branch stack layout: + * nr: number of taken branches stored in entries[] + * + * Note that nr can vary from sample to sample + * branches (to, from) are stored from most recent + * to least recent, i.e., entries[0] contains the most + * recent branch. + */ struct perf_branch_stack { __u64 nr; struct perf_branch_entry entries[0]; @@ -566,7 +620,9 @@ struct hw_perf_event { unsigned long event_base; int idx; int last_cpu; + struct hw_perf_event_extra extra_reg; + struct hw_perf_event_extra branch_reg; }; struct { /* software */ struct hrtimer hrtimer; @@ -1007,12 +1063,14 @@ struct perf_sample_data { u64 period; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; + struct perf_branch_stack *br_stack; }; static inline void perf_sample_data_init(struct perf_sample_data *data, u64 addr) { data->addr = addr; data->raw = NULL; + data->br_stack = NULL; } extern void perf_output_sample(struct perf_output_handle *handle, @@ -1151,6 +1209,11 @@ extern void perf_bp_event(struct perf_event *event, void *data); # define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +static inline bool has_branch_stack(struct perf_event *event) +{ + return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/kernel/events/core.c b/kernel/events/core.c index e8b32ac75ce3..5820efdf47cd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -118,6 +118,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info) PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP) +/* + * branch priv levels that need permission checks + */ +#define PERF_SAMPLE_BRANCH_PERM_PLM \ + (PERF_SAMPLE_BRANCH_KERNEL |\ + PERF_SAMPLE_BRANCH_HV) + enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, @@ -3907,6 +3914,24 @@ void perf_output_sample(struct perf_output_handle *handle, } } } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + if (data->br_stack) { + size_t size; + + size = data->br_stack->nr + * sizeof(struct perf_branch_entry); + + perf_output_put(handle, data->br_stack->nr); + perf_output_copy(handle, data->br_stack->entries, size); + } else { + /* + * we always store at least the value of nr + */ + u64 nr = 0; + perf_output_put(handle, nr); + } + } } void perf_prepare_sample(struct perf_event_header *header, @@ -3949,6 +3974,15 @@ void perf_prepare_sample(struct perf_event_header *header, WARN_ON_ONCE(size & (sizeof(u64)-1)); header->size += size; } + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + int size = sizeof(u64); /* nr */ + if (data->br_stack) { + size += data->br_stack->nr + * sizeof(struct perf_branch_entry); + } + header->size += size; + } } static void perf_event_output(struct perf_event *event, @@ -5935,6 +5969,40 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->read_format & ~(PERF_FORMAT_MAX-1)) return -EINVAL; + if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) { + u64 mask = attr->branch_sample_type; + + /* only using defined bits */ + if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1)) + return -EINVAL; + + /* at least one branch bit must be set */ + if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) + return -EINVAL; + + /* kernel level capture: check permissions */ + if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) + && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + /* propagate priv level, when not set for branch */ + if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { + + /* exclude_kernel checked on syscall entry */ + if (!attr->exclude_kernel) + mask |= PERF_SAMPLE_BRANCH_KERNEL; + + if (!attr->exclude_user) + mask |= PERF_SAMPLE_BRANCH_USER; + + if (!attr->exclude_hv) + mask |= PERF_SAMPLE_BRANCH_HV; + /* + * adjust user setting (for HW filter setup) + */ + attr->branch_sample_type = mask; + } + } out: return ret; -- cgit v1.2.3 From 2481c5fa6db0237e4f0168f88913178b2b495b7c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:20:59 +0100 Subject: perf: Disable PERF_SAMPLE_BRANCH_* when not supported PERF_SAMPLE_BRANCH_* is disabled for: - SW events (sw counters, tracepoints) - HW breakpoints - ALL but Intel x86 architecture - AMD64 processors Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-10-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/alpha/kernel/perf_event.c | 4 ++++ arch/arm/kernel/perf_event.c | 4 ++++ arch/mips/kernel/perf_event_mipsxx.c | 4 ++++ arch/powerpc/kernel/perf_event.c | 4 ++++ arch/sh/kernel/perf_event.c | 4 ++++ arch/sparc/kernel/perf_event.c | 4 ++++ arch/x86/kernel/cpu/perf_event_amd.c | 3 +++ kernel/events/core.c | 24 ++++++++++++++++++++++++ kernel/events/hw_breakpoint.c | 6 ++++++ 9 files changed, 57 insertions(+) (limited to 'kernel') diff --git a/arch/alpha/kernel/perf_event.c b/arch/alpha/kernel/perf_event.c index 8143cd7cdbfb..0dae252f7a33 100644 --- a/arch/alpha/kernel/perf_event.c +++ b/arch/alpha/kernel/perf_event.c @@ -685,6 +685,10 @@ static int alpha_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index 5bb91bf3d47f..a23c42abc694 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -539,6 +539,10 @@ static int armpmu_event_init(struct perf_event *event) int err = 0; atomic_t *active_events = &armpmu->active_events; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (armpmu->map_event(event) == -ENOENT) return -ENOENT; diff --git a/arch/mips/kernel/perf_event_mipsxx.c b/arch/mips/kernel/perf_event_mipsxx.c index e3b897acfbc0..811084f4e422 100644 --- a/arch/mips/kernel/perf_event_mipsxx.c +++ b/arch/mips/kernel/perf_event_mipsxx.c @@ -606,6 +606,10 @@ static int mipspmu_event_init(struct perf_event *event) { int err = 0; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index f04c2301725e..c2e27ede07ec 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1084,6 +1084,10 @@ static int power_pmu_event_init(struct perf_event *event) if (!ppmu) return -ENOENT; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_HARDWARE: ev = event->attr.config; diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index 10b14e3a7eb8..068b8a2759b5 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -310,6 +310,10 @@ static int sh_pmu_event_init(struct perf_event *event) { int err; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_RAW: case PERF_TYPE_HW_CACHE: diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 614da624330c..8e16a4a21582 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1105,6 +1105,10 @@ static int sparc_pmu_event_init(struct perf_event *event) if (atomic_read(&nmi_active) < 0) return -ENODEV; + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (attr->type) { case PERF_TYPE_HARDWARE: if (attr->config >= sparc_pmu->max_events) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 67250a52430b..dd002faff7a6 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -139,6 +139,9 @@ static int amd_pmu_hw_config(struct perf_event *event) if (ret) return ret; + if (has_branch_stack(event)) + return -EOPNOTSUPP; + if (event->attr.exclude_host && event->attr.exclude_guest) /* * When HO == GO == 1 the hardware treats that as GO == HO == 0 diff --git a/kernel/events/core.c b/kernel/events/core.c index 5820efdf47cd..242bb51c67f2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5044,6 +5044,12 @@ static int perf_swevent_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: case PERF_COUNT_SW_TASK_CLOCK: @@ -5154,6 +5160,12 @@ static int perf_tp_event_init(struct perf_event *event) if (event->attr.type != PERF_TYPE_TRACEPOINT) return -ENOENT; + /* + * no branch sampling for tracepoint events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + err = perf_trace_init(event); if (err) return err; @@ -5379,6 +5391,12 @@ static int cpu_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; @@ -5453,6 +5471,12 @@ static int task_clock_event_init(struct perf_event *event) if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) return -ENOENT; + /* + * no branch sampling for software events + */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + perf_swevent_init_hrtimer(event); return 0; diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 3330022a7ac1..bb38c4d3ee12 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -581,6 +581,12 @@ static int hw_breakpoint_event_init(struct perf_event *bp) if (bp->attr.type != PERF_TYPE_BREAKPOINT) return -ENOENT; + /* + * no branch sampling for breakpoint events + */ + if (has_branch_stack(bp)) + return -EOPNOTSUPP; + err = register_perf_hw_breakpoint(bp); if (err) return err; -- cgit v1.2.3 From d010b3326cf06b3406cdd88af16dcf4e4b6fec2e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 9 Feb 2012 23:21:00 +0100 Subject: perf: Add callback to flush branch_stack on context switch With branch stack sampling, it is possible to filter by priv levels. In system-wide mode, that means it is possible to capture only user level branches. The builtin SW LBR filter needs to disassemble code based on LBR captured addresses. For that, it needs to know the task the addresses are associated with. Because of context switches, the content of the branch stack buffer may contain addresses from different tasks. We need a callback on context switch to either flush the branch stack or save it. This patch adds a new callback in struct pmu which is called during context switches. The callback is called only when necessary. That is when a system-wide context has, at least, one event which uses PERF_SAMPLE_BRANCH_STACK. The callback is never called for per-thread context. In this version, the Intel x86 code simply flushes (resets) the LBR on context switches (fills it with zeroes). Those zeroed branches are then filtered out by the SW filter. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1328826068-11713-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 21 ++++++--- arch/x86/kernel/cpu/perf_event.h | 1 + arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++ include/linux/perf_event.h | 9 +++- kernel/events/core.c | 85 ++++++++++++++++++++++++++++++++++ 5 files changed, 121 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index cea567483274..0a18d16cb58d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1671,25 +1671,32 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_flush_branch_stack(void) +{ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); +} + static struct pmu pmu = { - .pmu_enable = x86_pmu_enable, - .pmu_disable = x86_pmu_disable, + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, .attr_groups = x86_pmu_attr_groups, .event_init = x86_pmu_event_init, - .add = x86_pmu_add, - .del = x86_pmu_del, - .start = x86_pmu_start, - .stop = x86_pmu_stop, - .read = x86_pmu_read, + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, .start_txn = x86_pmu_start_txn, .cancel_txn = x86_pmu_cancel_txn, .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, }; void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f104c054dc5c..74387c12dc72 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -324,6 +324,7 @@ struct x86_pmu { void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + void (*flush_branch_stack)(void); /* * Intel Arch Perfmon v2+ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7cc1e2dcc4dd..6627089232a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1539,6 +1539,18 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, @@ -1566,6 +1578,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, }; static __init void intel_clovertown_quirk(void) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5fc494f4a094..fbbf5e598368 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -746,6 +746,11 @@ struct pmu { * if no implementation is provided it will default to: event->hw.idx + 1. */ int (*event_idx) (struct perf_event *event); /*optional */ + + /* + * flush branch stack on context-switches (needed in cpu-wide mode) + */ + void (*flush_branch_stack) (void); }; /** @@ -979,7 +984,8 @@ struct perf_event_context { u64 parent_gen; u64 generation; int pin_count; - int nr_cgroups; /* cgroup events present */ + int nr_cgroups; /* cgroup evts */ + int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; }; @@ -1044,6 +1050,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, extern u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); + struct perf_sample_data { u64 type; diff --git a/kernel/events/core.c b/kernel/events/core.c index 242bb51c67f2..c61234b1a988 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -137,6 +137,7 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); +static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -888,6 +889,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; + if (has_branch_stack(event)) + ctx->nr_branch_stack++; + list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) perf_pmu_rotate_start(ctx->pmu); @@ -1027,6 +1031,9 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } + if (has_branch_stack(event)) + ctx->nr_branch_stack--; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2201,6 +2208,66 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_pmu_rotate_start(ctx->pmu); } +/* + * When sampling the branck stack in system-wide, it may be necessary + * to flush the stack on context switch. This happens when the branch + * stack does not tag its entries with the pid of the current task. + * Otherwise it becomes impossible to associate a branch entry with a + * task. This ambiguity is more likely to appear when the branch stack + * supports priv level filtering and the user sets it to monitor only + * at the user level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch stack with + * branch from multiple tasks. Flushing may mean dropping the existing + * entries or stashing them somewhere in the PMU specific code layer. + * + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when there is at least one system-wide context + * with at least one active event using taken branch sampling. + */ +static void perf_branch_stack_sched_in(struct task_struct *prev, + struct task_struct *task) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* no need to flush branch stack if not changing task */ + if (prev == task) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + /* + * check if the context has at least one + * event using PERF_SAMPLE_BRANCH_STACK + */ + if (cpuctx->ctx.nr_branch_stack > 0 + && pmu->flush_branch_stack) { + + pmu = cpuctx->ctx.pmu; + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->flush_branch_stack(); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -2232,6 +2299,10 @@ void __perf_event_task_sched_in(struct task_struct *prev, */ if (atomic_read(&__get_cpu_var(perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + + /* check for system-wide branch_stack events */ + if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) + perf_branch_stack_sched_in(prev, task); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) @@ -2798,6 +2869,14 @@ static void free_event(struct perf_event *event) atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); static_key_slow_dec_deferred(&perf_sched_events); } + + if (has_branch_stack(event)) { + static_key_slow_dec_deferred(&perf_sched_events); + /* is system-wide event */ + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } if (event->rb) { @@ -5924,6 +6003,12 @@ done: return ERR_PTR(err); } } + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } } return event; -- cgit v1.2.3 From c22ab332902333f83766017478c1ef6607ace681 Mon Sep 17 00:00:00 2001 From: Matthew Garrett Date: Mon, 5 Mar 2012 14:59:10 -0800 Subject: kmsg_dump: don't run on non-error paths by default Since commit 04c6862c055f ("kmsg_dump: add kmsg_dump() calls to the reboot, halt, poweroff and emergency_restart paths"), kmsg_dump() gets run on normal paths including poweroff and reboot. This is less than ideal given pstore implementations that can only represent single backtraces, since a reboot may overwrite a stored oops before it's been picked up by userspace. In addition, some pstore backends may have low performance and provide a significant delay in reboot as a result. This patch adds a printk.always_kmsg_dump kernel parameter (which can also be changed from userspace). Without it, the code will only be run on failure paths rather than on normal paths. The option can be enabled in environments where there's a desire to attempt to audit whether or not a reboot was cleanly requested or not. Signed-off-by: Matthew Garrett Acked-by: Seiji Aguchi Cc: Seiji Aguchi Cc: David Woodhouse Cc: Marco Stornelli Cc: Artem Bityutskiy Cc: KOSAKI Motohiro Cc: Vivek Goyal Cc: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 6 ++++++ include/linux/kmsg_dump.h | 9 +++++++-- kernel/printk.c | 6 ++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 033d4e69b43b..d99fd9c0ec0e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2211,6 +2211,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. default: off. + printk.always_kmsg_dump= + Trigger kmsg_dump for cases other than kernel oops or + panics + Format: (1/Y/y=enable, 0/N/n=disable) + default: disabled + printk.time= Show timing data prefixed to each printk message line Format: (1/Y/y=enable, 0/N/n=disable) diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index fee66317e071..35f7237ec972 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -15,13 +15,18 @@ #include #include +/* + * Keep this list arranged in rough order of priority. Anything listed after + * KMSG_DUMP_OOPS will not be logged by default unless printk.always_kmsg_dump + * is passed to the kernel. + */ enum kmsg_dump_reason { - KMSG_DUMP_OOPS, KMSG_DUMP_PANIC, + KMSG_DUMP_OOPS, + KMSG_DUMP_EMERG, KMSG_DUMP_RESTART, KMSG_DUMP_HALT, KMSG_DUMP_POWEROFF, - KMSG_DUMP_EMERG, }; /** diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..32690a0b7a18 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -702,6 +702,9 @@ static bool printk_time = 0; #endif module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); +static bool always_kmsg_dump; +module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR); + /* Check if we have any console registered that can be called early in boot. */ static int have_callable_console(void) { @@ -1732,6 +1735,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) unsigned long l1, l2; unsigned long flags; + if ((reason > KMSG_DUMP_OOPS) && !always_kmsg_dump) + return; + /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ -- cgit v1.2.3 From f986a499ef6f317d906e6f6f281be966e1237a10 Mon Sep 17 00:00:00 2001 From: Prashanth Nageshappa Date: Mon, 5 Mar 2012 14:59:12 -0800 Subject: kprobes: return proper error code from register_kprobe() register_kprobe() aborts if the address of the new request falls in a prohibited area (such as ftrace pouch, __kprobes annotated functions, non-kernel text addresses, jump label text). We however don't return the right error on this abort, resulting in a silent failure - incorrect adding/reporting of kprobes ('perf probe do_fork+18' or 'perf probe mcount' for instance). In V2 we are incorporating Masami Hiramatsu's feedback. This patch fixes it by returning -EINVAL upon failure. While we are here, rename the label used for exit to be more appropriate. Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Prashanth K Nageshappa Acked-by: Masami Hiramatsu Cc: Jason Baron Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9788c0ec6f43..c62b8546cc90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1334,8 +1334,10 @@ int __kprobes register_kprobe(struct kprobe *p) if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || ftrace_text_reserved(p->addr, p->addr) || - jump_label_text_reserved(p->addr, p->addr)) - goto fail_with_jump_label; + jump_label_text_reserved(p->addr, p->addr)) { + ret = -EINVAL; + goto cannot_probe; + } /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ p->flags &= KPROBE_FLAG_DISABLED; @@ -1352,7 +1354,7 @@ int __kprobes register_kprobe(struct kprobe *p) * its code to prohibit unexpected unloading. */ if (unlikely(!try_module_get(probed_mod))) - goto fail_with_jump_label; + goto cannot_probe; /* * If the module freed .init.text, we couldn't insert @@ -1361,7 +1363,7 @@ int __kprobes register_kprobe(struct kprobe *p) if (within_module_init((unsigned long)p->addr, probed_mod) && probed_mod->state != MODULE_STATE_COMING) { module_put(probed_mod); - goto fail_with_jump_label; + goto cannot_probe; } /* ret will be updated by following code */ } @@ -1409,7 +1411,7 @@ out: return ret; -fail_with_jump_label: +cannot_probe: preempt_enable(); jump_label_unlock(); return ret; -- cgit v1.2.3 From c415c3b47ea2754659d915cca387a20999044163 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: introduce complete_vfork_done() No functional changes. Move the clear-and-complete-vfork_done code into the new trivial helper, complete_vfork_done(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 8 ++------ include/linux/sched.h | 1 + kernel/fork.c | 17 ++++++++++------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 92ce83a11e90..dccdcec913e9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1915,7 +1915,6 @@ static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - struct completion *vfork_done; int core_waiters = -EBUSY; init_completion(&core_state->startup); @@ -1934,11 +1933,8 @@ static int coredump_wait(int exit_code, struct core_state *core_state) * Make sure nobody is waiting for us to release the VM, * otherwise we can deadlock when we wait on each other */ - vfork_done = tsk->vfork_done; - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); if (core_waiters) wait_for_completion(&core_state->startup); diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d379a6bfd88..1b25a37f2aee 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,6 +2291,7 @@ extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); +extern void complete_vfork_done(struct task_struct *tsk); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); diff --git a/kernel/fork.c b/kernel/fork.c index e2cd3e2a5ae8..cf3d96379608 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -668,6 +668,14 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } +void complete_vfork_done(struct task_struct *tsk) +{ + struct completion *vfork_done = tsk->vfork_done; + + tsk->vfork_done = NULL; + complete(vfork_done); +} + /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. @@ -683,8 +691,6 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) */ void mm_release(struct task_struct *tsk, struct mm_struct *mm) { - struct completion *vfork_done = tsk->vfork_done; - /* Get rid of any futexes when releasing the mm */ #ifdef CONFIG_FUTEX if (unlikely(tsk->robust_list)) { @@ -704,11 +710,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) /* Get rid of any cached register state */ deactivate_mm(tsk, mm); - /* notify parent sleeping on vfork() */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + if (tsk->vfork_done) + complete_vfork_done(tsk); /* * If we're exiting normally, clear a user-space tid field if -- cgit v1.2.3 From d68b46fe16ad59b3a5f51ec73daaa5dc06753798 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: vfork: make it killable Make vfork() killable. Change do_fork(CLONE_VFORK) to do wait_for_completion_killable(). If it fails we do not return to the user-mode and never touch the memory shared with our child. However, in this case we should clear child->vfork_done before return, we use task_lock() in do_fork()->wait_for_vfork_done() and complete_vfork_done() to serialize with each other. Note: now that we use task_lock() we don't really need completion, we could turn task->vfork_done into "task_struct *wake_up_me" but this needs some complications. NOTE: this and the next patches do not affect in-kernel users of CLONE_VFORK, kernel threads run with all signals ignored including SIGKILL/SIGSTOP. However this is obviously the user-visible change. Not only a fatal signal can kill the vforking parent, a sub-thread can do execve or exit_group() and kill the thread sleeping in vfork(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- kernel/fork.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1b25a37f2aee..b6467711f12e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2372,7 +2372,7 @@ static inline int thread_group_empty(struct task_struct *p) * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and - * ->cgroup.subsys[]. + * ->cgroup.subsys[]. And ->vfork_done. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), diff --git a/kernel/fork.c b/kernel/fork.c index cf3d96379608..892c534ce6e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -670,10 +670,34 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) void complete_vfork_done(struct task_struct *tsk) { - struct completion *vfork_done = tsk->vfork_done; + struct completion *vfork; - tsk->vfork_done = NULL; - complete(vfork_done); + task_lock(tsk); + vfork = tsk->vfork_done; + if (likely(vfork)) { + tsk->vfork_done = NULL; + complete(vfork); + } + task_unlock(tsk); +} + +static int wait_for_vfork_done(struct task_struct *child, + struct completion *vfork) +{ + int killed; + + freezer_do_not_count(); + killed = wait_for_completion_killable(vfork); + freezer_count(); + + if (killed) { + task_lock(child); + child->vfork_done = NULL; + task_unlock(child); + } + + put_task_struct(child); + return killed; } /* Please note the differences between mmput and mm_release. @@ -717,7 +741,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) * If we're exiting normally, clear a user-space tid field if * requested. We leave this alone when dying by signal, to leave * the value intact in a core dump, and to save the unnecessary - * trouble otherwise. Userland only wants this done for a sys_exit. + * trouble, say, a killed vfork parent shouldn't touch this mm. + * Userland only wants this done for a sys_exit. */ if (tsk->clear_child_tid) { if (!(tsk->flags & PF_SIGNALED) && @@ -1551,6 +1576,7 @@ long do_fork(unsigned long clone_flags, if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); + get_task_struct(p); } /* @@ -1568,10 +1594,8 @@ long do_fork(unsigned long clone_flags, ptrace_event(trace, nr); if (clone_flags & CLONE_VFORK) { - freezer_do_not_count(); - wait_for_completion(&vfork); - freezer_count(); - ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event(PTRACE_EVENT_VFORK_DONE, nr); } } else { nr = PTR_ERR(p); -- cgit v1.2.3 From 57b59c4a1400fa6c34764eab2e35a8762dc05a09 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:13 -0800 Subject: coredump_wait: don't call complete_vfork_done() Now that CLONE_VFORK is killable, coredump_wait() no longer needs complete_vfork_done(). zap_threads() should find and kill all tasks with the same ->mm, this includes our parent if ->vfork_done is set. mm_release() becomes the only caller, unexport complete_vfork_done(). Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 14 ++------------ include/linux/sched.h | 1 - kernel/fork.c | 2 +- 3 files changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index dccdcec913e9..153dee14fe55 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1926,19 +1926,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state) core_waiters = zap_threads(tsk, mm, core_state, exit_code); up_write(&mm->mmap_sem); - if (unlikely(core_waiters < 0)) - goto fail; - - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (tsk->vfork_done) - complete_vfork_done(tsk); - - if (core_waiters) + if (core_waiters > 0) wait_for_completion(&core_state->startup); -fail: + return core_waiters; } diff --git a/include/linux/sched.h b/include/linux/sched.h index b6467711f12e..11fcafaf4ae4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2291,7 +2291,6 @@ extern int do_execve(const char *, const char __user * const __user *, const char __user * const __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); -extern void complete_vfork_done(struct task_struct *tsk); struct task_struct *fork_idle(int); extern void set_task_comm(struct task_struct *tsk, char *from); diff --git a/kernel/fork.c b/kernel/fork.c index 892c534ce6e3..44b0e21af50e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -668,7 +668,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return mm; } -void complete_vfork_done(struct task_struct *tsk) +static void complete_vfork_done(struct task_struct *tsk) { struct completion *vfork; -- cgit v1.2.3 From 6e27f63edbd7ab893258e16500171dd1270a1369 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:14 -0800 Subject: vfork: kill PF_STARTING Previously it was (ab)used by utrace. Then it was wrongly used by the scheduler code. Currently it is not used, kill it before it finds the new erroneous user. Signed-off-by: Oleg Nesterov Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 - kernel/fork.c | 9 --------- 2 files changed, 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 11fcafaf4ae4..0657368bd78f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1777,7 +1777,6 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t * /* * Per process flags */ -#define PF_STARTING 0x00000002 /* being created */ #define PF_EXITING 0x00000004 /* getting shut down */ #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ #define PF_VCPU 0x00000010 /* I'm a virtual CPU */ diff --git a/kernel/fork.c b/kernel/fork.c index 44b0e21af50e..26a7a6707fa7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1046,7 +1046,6 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p) new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER); new_flags |= PF_FORKNOEXEC; - new_flags |= PF_STARTING; p->flags = new_flags; } @@ -1579,14 +1578,6 @@ long do_fork(unsigned long clone_flags, get_task_struct(p); } - /* - * We set PF_STARTING at creation in case tracing wants to - * use this to distinguish a fully live task from one that - * hasn't finished SIGSTOP raising yet. Now we clear it - * and set the child going. - */ - p->flags &= ~PF_STARTING; - wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ -- cgit v1.2.3 From 6027ce497d44dd8eae1a9215789df178f6b422cc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 5 Mar 2012 14:59:14 -0800 Subject: hung_task: fix the broken rcu_lock_break() logic check_hung_uninterruptible_tasks()->rcu_lock_break() introduced by "softlockup: check all tasks in hung_task" commit ce9dbe24 looks absolutely wrong. - rcu_lock_break() does put_task_struct(). If the task has exited it is not safe to even read its ->state, nothing protects this task_struct. - The TASK_DEAD checks are wrong too. Contrary to the comment, we can't use it to check if the task was unhashed. It can be unhashed without TASK_DEAD, or it can be valid with TASK_DEAD. For example, an autoreaping task can do release_task(current) long before it sets TASK_DEAD in do_exit(). Or, a zombie task can have ->state == TASK_DEAD but release_task() was not called, and in this case we must not break the loop. Change this code to check pid_alive() instead, and do this before we drop the reference to the task_struct. Note: while_each_thread() under rcu_read_lock() is not really safe, it can livelock. This will be fixed later, but fortunately in this case the "max_count" logic saves us anyway. Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Acked-by: Mandeep Singh Baines Acked-by: Paul E. McKenney Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 2e48ec0c2e91..c21449f85a2a 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -119,15 +119,20 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * For preemptible RCU it is sufficient to call rcu_read_unlock in order * to exit the grace period. For classic RCU, a reschedule is required. */ -static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) { + bool can_cont; + get_task_struct(g); get_task_struct(t); rcu_read_unlock(); cond_resched(); rcu_read_lock(); + can_cont = pid_alive(g) && pid_alive(t); put_task_struct(t); put_task_struct(g); + + return can_cont; } /* @@ -154,9 +159,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; if (!--batch_count) { batch_count = HUNG_TASK_BATCHING; - rcu_lock_break(g, t); - /* Exit if t or g was unhashed during refresh. */ - if (t->state == TASK_DEAD || g->state == TASK_DEAD) + if (!rcu_lock_break(g, t)) goto unlock; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ -- cgit v1.2.3 From a09b659cd68c10ec6a30cb91ebd2c327fcd5bfe5 Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 5 Mar 2012 15:07:25 -0800 Subject: genirq: Fix long-term regression in genirq irq_set_irq_type() handling In 2008, commit 0c5d1eb77a8be ("genirq: record trigger type") modified the way set_irq_type() handles the 'no trigger' condition. However, this has an adverse effect on PCMCIA support on Intel StrongARM and probably PXA platforms. PCMCIA has several status signals on the socket which can trigger interrupts; some of these status signals depend on the card's mode (whether it is configured in memory or IO mode). For example, cards have a 'Ready/IRQ' signal: in memory mode, this provides an indication to PCMCIA that the card has finished its power up initialization. In IO mode, it provides the device interrupt signal. Other status signals switch between on-board battery status and loud speaker output. In classical PCMCIA implementations, where you have a specific socket controller, the controller provides a method to mask interrupts from the socket, and importantly ignore any state transitions on the pins which correspond with interrupts once masked. This masking prevents unwanted events caused by the removal and application of socket power being forwarded. However, on platforms where there is no socket controller, the PCMCIA status and interrupt signals are routed to standard edge-triggered GPIOs. These GPIOs can be configured to interrupt on rising edge, falling edge, or never. This is where the problems start. Edge triggered interrupts are required to record events while disabled via the usual methods of {free,request,disable,enable}_irq() to prevent problems with dropped interrupts (eg, the 8390 driver uses disable_irq() to defer the delivery of interrupts). As a result, these interfaces can not be used to implement the desired behaviour. The side effect of this is that if the 'Ready/IRQ' GPIO is disabled via disable_irq() on suspend, and enabled via enable_irq() after resume, we will record the state transitions caused by powering events as valid interrupts, and foward them to the card driver, which may attempt to access a card which is not powered up. This leads delays resume while drivers spin in their interrupt handlers, and complaints from drivers before they realize what's happened. Moreover, in the case of the 'Ready/IRQ' signal, this is requested and freed by the card driver itself; the PCMCIA core has no idea whether the interrupt is requested, and, therefore, whether a call to disable_irq() would be valid. (We tried this around 2.4.17 / 2.5.1 kernel era, and ended up throwing it out because of this problem.) Therefore, it was decided back in around 2002 to disable the edge triggering instead, resulting in all state transitions on the GPIO being ignored. That's what we actually need the hardware to do. The commit above changes this behaviour; it explicitly prevents the 'no trigger' state being selected. The reason that request_irq() does not accept the 'no trigger' state is for compatibility with existing drivers which do not provide their desired triggering configuration. The set_irq_type() function is 'new' and not used by non-trigger aware drivers. Therefore, revert this change, and restore previously working platforms back to their former state. Signed-off-by: Russell King Cc: linux@arm.linux.org.uk Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f7c543a801d9..4dbc9c9d641b 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -61,8 +61,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type) return -EINVAL; type &= IRQ_TYPE_SENSE_MASK; - if (type != IRQ_TYPE_NONE) - ret = __irq_set_trigger(desc, irq, type); + ret = __irq_set_trigger(desc, irq, type); irq_put_desc_busunlock(desc, flags); return ret; } -- cgit v1.2.3 From b2a00178614e2cdd981a708d22a05c1ce4eadfd7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 5 Mar 2012 15:07:25 -0800 Subject: softirq: Reduce invoke_softirq() code duplication The two invoke_softirq() variants are identical except for a single line. So move the #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED inside one of the functions and get rid of the other one. Signed-off-by: Heiko Carstens Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/softirq.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 4eb3a0fa351e..c82d95a022ef 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -310,31 +310,21 @@ void irq_enter(void) __irq_enter(); } -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED static inline void invoke_softirq(void) { - if (!force_irqthreads) + if (!force_irqthreads) { +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED __do_softirq(); - else { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); - wakeup_softirqd(); - __local_bh_enable(SOFTIRQ_OFFSET); - } -} #else -static inline void invoke_softirq(void) -{ - if (!force_irqthreads) do_softirq(); - else { +#endif + } else { __local_bh_disable((unsigned long)__builtin_return_address(0), SOFTIRQ_OFFSET); wakeup_softirqd(); __local_bh_enable(SOFTIRQ_OFFSET); } } -#endif /* * Exit an interrupt context. Process softirqs if needed and possible: -- cgit v1.2.3 From 52abb700e16a9aa4cbc03f3d7f80206cbbc80680 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 6 Mar 2012 23:18:54 +0100 Subject: genirq: Clear action->thread_mask if IRQ_ONESHOT is not set Xommit ac5637611(genirq: Unmask oneshot irqs when thread was not woken) fails to unmask when a !IRQ_ONESHOT threaded handler is handled by handle_level_irq. This happens because thread_mask is or'ed unconditionally in irq_wake_thread(), but for !IRQ_ONESHOT interrupts never cleared. So the check for !desc->thread_active fails and keeps the interrupt disabled. Keep the thread_mask zero for !IRQ_ONESHOT interrupts. Document the thread_mask magic while at it. Reported-and-tested-by: Sven Joachim Reported-and-tested-by: Stefan Lippers-Hollmann Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 32313c084442..0f0d4704ddd8 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -985,6 +985,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) /* add new interrupt at end of irq queue */ do { + /* + * Or all existing action->thread_mask bits, + * so we can find the next zero bit for this + * new action. + */ thread_mask |= old->thread_mask; old_ptr = &old->next; old = *old_ptr; @@ -993,14 +998,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* - * Setup the thread mask for this irqaction. Unlikely to have - * 32 resp 64 irqs sharing one line, but who knows. + * Setup the thread mask for this irqaction for ONESHOT. For + * !ONESHOT irqs the thread mask is 0 so we can avoid a + * conditional in irq_wake_thread(). */ - if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) { - ret = -EBUSY; - goto out_mask; + if (new->flags & IRQF_ONESHOT) { + /* + * Unlikely to have 32 resp 64 irqs sharing one line, + * but who knows. + */ + if (thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + /* + * The thread_mask for the action is or'ed to + * desc->thread_active to indicate that the + * IRQF_ONESHOT thread handler has been woken, but not + * yet finished. The bit is cleared when a thread + * completes. When all threads of a shared interrupt + * line have completed desc->threads_active becomes + * zero and the interrupt line is unmasked. See + * handle.c:irq_wake_thread() for further information. + * + * If no thread is woken by primary (hard irq context) + * interrupt handlers, then desc->threads_active is + * also checked for zero to unmask the irq line in the + * affected hard irq flow handlers + * (handle_[fasteoi|level]_irq). + * + * The new action gets the first zero bit of + * thread_mask assigned. See the loop above which or's + * all existing action->thread_mask bits. + */ + new->thread_mask = 1 << ffz(thread_mask); } - new->thread_mask = 1 << ffz(thread_mask); if (!shared) { init_waitqueue_head(&desc->wait_for_threads); -- cgit v1.2.3 From 4293f20c19f44ca66e5ac836b411d25e14b9f185 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 7 Mar 2012 08:21:19 -0800 Subject: Revert "CPU hotplug, cpusets, suspend: Don't touch cpusets during suspend/resume" This reverts commit 8f2f748b0656257153bcf0941df8d6060acc5ca6. It causes some odd regression that we have not figured out, and it's too late in the -rc series to try to figure it out now. As reported by Konstantin Khlebnikov, it causes consistent hangs on his laptop (Thinkpad x220: 2x cores + HT). They can be avoided by adding calls to "rebuild_sched_domains();" in cpuset_cpu_[in]active() for the CPU_{ONLINE/DOWN_FAILED/DOWN_PREPARE}_FROZEN cases, but it's not at all clear why, and it makes no sense. Konstantin's config doesn't even have CONFIG_CPUSETS enabled, just to make things even more interesting. So it's not the cpusets, it's just the scheduling domains. So until this is understood, revert. Bisected-reported-and-tested-by: Konstantin Khlebnikov Acked-by: Peter Zijlstra Acked-by: Ingo Molnar Acked-by: Srivatsa S. Bhat Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 33a0676ea744..b342f57879e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6728,7 +6728,7 @@ int __init sched_create_sysfs_power_savings_entries(struct device *dev) static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: case CPU_DOWN_FAILED: cpuset_update_active_cpus(); @@ -6741,7 +6741,7 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: cpuset_update_active_cpus(); return NOTIFY_OK; -- cgit v1.2.3 From 540b60e24f3f4781d80e47122f0c4486a03375b8 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 9 Mar 2012 14:59:13 +0100 Subject: genirq: Fix incorrect check for forced IRQ thread handler We do not want a bitwise AND between boolean operands Signed-off-by: Alexander Gordeev Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/20120309135912.GA2114@dhcp-26-207.brq.redhat.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a9a9dbe49fea..c0730ad8a117 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -773,7 +773,7 @@ static int irq_thread(void *data) struct irqaction *action); int wake; - if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD, + if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; else -- cgit v1.2.3 From 4bcdf1d0b652bc33d52f2322b77463e4dc58abf8 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 9 Mar 2012 14:59:26 +0100 Subject: genirq: Get rid of unnecessary irqaction field in task_struct When a new thread handler is created, an irqaction is passed to it as data. Not only that irqaction is stored in task_struct by the handler for later use, but also a structure associated with the kernel thread keeps this value as long as the thread exists. This fix kicks irqaction out off task_struct. Yes, I introduce new bit field. But it allows not only to eliminate the duplicate, but also shortens size of task_struct. Reported-by: Oleg Nesterov Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120309135925.GB2114@dhcp-26-207.brq.redhat.com Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 10 +++++----- kernel/irq/manage.c | 19 +++++++++++-------- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 7d379a6bfd88..07f537a371ce 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1319,6 +1319,11 @@ struct task_struct { unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; +#ifdef CONFIG_GENERIC_HARDIRQS + /* IRQ handler threads */ + unsigned irq_thread:1; +#endif + pid_t pid; pid_t tgid; @@ -1427,11 +1432,6 @@ struct task_struct { * mempolicy */ spinlock_t alloc_lock; -#ifdef CONFIG_GENERIC_HARDIRQS - /* IRQ handler threads */ - struct irqaction *irqaction; -#endif - /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c0730ad8a117..0fa3ce998ecb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -780,7 +780,7 @@ static int irq_thread(void *data) handler_fn = irq_thread_fn; sched_setscheduler(current, SCHED_FIFO, ¶m); - current->irqaction = action; + current->irq_thread = 1; while (!irq_wait_for_interrupt(action)) { @@ -818,10 +818,10 @@ static int irq_thread(void *data) irq_finalize_oneshot(desc, action, true); /* - * Clear irqaction. Otherwise exit_irq_thread() would make + * Clear irq_thread. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. */ - current->irqaction = NULL; + current->irq_thread = 0; return 0; } @@ -832,27 +832,30 @@ void exit_irq_thread(void) { struct task_struct *tsk = current; struct irq_desc *desc; + struct irqaction *action; - if (!tsk->irqaction) + if (!tsk->irq_thread) return; + action = kthread_data(tsk); + printk(KERN_ERR "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", - tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + tsk->comm ? tsk->comm : "", tsk->pid, action->irq); - desc = irq_to_desc(tsk->irqaction->irq); + desc = irq_to_desc(action->irq); /* * Prevent a stale desc->threads_oneshot. Must be called * before setting the IRQTF_DIED flag. */ - irq_finalize_oneshot(desc, tsk->irqaction, true); + irq_finalize_oneshot(desc, action, true); /* * Set the THREAD DIED flag to prevent further wakeups of the * soon to be gone threaded handler. */ - set_bit(IRQTF_DIED, &tsk->irqaction->flags); + set_bit(IRQTF_DIED, &action->flags); } static void irq_setup_forced_threading(struct irqaction *new) -- cgit v1.2.3 From 05d74efa3c72a5c40b0edeb15856c0230126313b Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 9 Mar 2012 14:59:40 +0100 Subject: genirq: No need to check IRQTF_DIED before stopping a thread handler Since 63706172f332fd3f6e7458ebfb35fa6de9c21dc5 kthread_stop() is not afraid of dead kernel threads. So no need to check if a thread is alive before stopping it. These checks still were racy. Reported-by: Oleg Nesterov Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120309135939.GC2114@dhcp-26-207.brq.redhat.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0fa3ce998ecb..3feab4ab6877 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1106,8 +1106,7 @@ out_thread: struct task_struct *t = new->thread; new->thread = NULL; - if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) - kthread_stop(t); + kthread_stop(t); put_task_struct(t); } out_mput: @@ -1217,8 +1216,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif if (action->thread) { - if (!test_bit(IRQTF_DIED, &action->thread_flags)) - kthread_stop(action->thread); + kthread_stop(action->thread); put_task_struct(action->thread); } -- cgit v1.2.3 From 5234ffb9f74cfa8993d174782bc861dd9b7b5bfb Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Fri, 9 Mar 2012 14:59:59 +0100 Subject: genirq: Get rid of unnecessary IRQTF_DIED flag Currently IRQTF_DIED flag is set when a IRQ thread handler calls do_exit() But also PF_EXITING per process flag gets set when a thread exits. This fix eliminates the duplicate by using PF_EXITING flag. Also, there is a race condition in exit_irq_thread(). In case a thread's bit is cleared in desc->threads_oneshot (and the IRQ line gets unmasked), but before IRQTF_DIED flag is set, a new interrupt might come in and set just cleared bit again, this time forever. This fix throws IRQTF_DIED flag away, eliminating the race as a result. [ tglx: Test THREAD_EXITING first as suggested by Oleg ] Reported-by: Oleg Nesterov Signed-off-by: Alexander Gordeev Link: http://lkml.kernel.org/r/20120309135958.GD2114@dhcp-26-207.brq.redhat.com Signed-off-by: Thomas Gleixner --- kernel/exit.c | 4 ++-- kernel/irq/handle.c | 2 +- kernel/irq/internals.h | 2 -- kernel/irq/manage.c | 11 +---------- 4 files changed, 4 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 4b4042f9bc6a..752d2c0abd19 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -935,8 +935,6 @@ void do_exit(long code) schedule(); } - exit_irq_thread(); - exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against @@ -945,6 +943,8 @@ void do_exit(long code) smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); + exit_irq_thread(); + if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, task_pid_nr(current), diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 470d08c82bbe..500aaf67c546 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -60,7 +60,7 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) * device interrupt, so no irq storm is lurking. If the * RUNTHREAD bit is already set, nothing to do. */ - if (test_bit(IRQTF_DIED, &action->thread_flags) || + if ((action->thread->flags & PF_EXITING) || test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) return; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b7952316016a..5c233e0ea2c3 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -20,14 +20,12 @@ extern bool noirqdebug; /* * Bits used by threaded handlers: * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run - * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed * IRQTF_AFFINITY - irq thread is requested to adjust affinity * IRQTF_FORCED_THREAD - irq action is force threaded */ enum { IRQTF_RUNTHREAD, - IRQTF_DIED, IRQTF_WARNED, IRQTF_AFFINITY, IRQTF_FORCED_THREAD, diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3feab4ab6877..a94466db73b2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -845,17 +845,8 @@ void exit_irq_thread(void) desc = irq_to_desc(action->irq); - /* - * Prevent a stale desc->threads_oneshot. Must be called - * before setting the IRQTF_DIED flag. - */ + /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action, true); - - /* - * Set the THREAD DIED flag to prevent further wakeups of the - * soon to be gone threaded handler. - */ - set_bit(IRQTF_DIED, &action->flags); } static void irq_setup_forced_threading(struct irqaction *new) -- cgit v1.2.3 From e06ffa1ede4146cbc261d90f5dff3d63fe2e9d7a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 9 Mar 2012 18:03:20 +0800 Subject: workqueue: use percpu allocator for cwq on UP I notice that the commit bbddff makes percpu allocator can work on UP, So we don't need the magic way for UP. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index bec7b5b53e03..5bbba094bfac 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -474,13 +474,8 @@ static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) { -#ifdef CONFIG_SMP + if (likely(cpu < nr_cpu_ids)) return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); -#else - return wq->cpu_wq.single; -#endif - } } else if (likely(cpu == WORK_CPU_UNBOUND)) return wq->cpu_wq.single; return NULL; @@ -2897,13 +2892,8 @@ static int alloc_cwqs(struct workqueue_struct *wq) const size_t size = sizeof(struct cpu_workqueue_struct); const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, __alignof__(unsigned long long)); -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) wq->cpu_wq.pcpu = __alloc_percpu(size, align); else { void *ptr; @@ -2927,13 +2917,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) static void free_cwqs(struct workqueue_struct *wq) { -#ifdef CONFIG_SMP - bool percpu = !(wq->flags & WQ_UNBOUND); -#else - bool percpu = false; -#endif - - if (percpu) + if (!(wq->flags & WQ_UNBOUND)) free_percpu(wq->cpu_wq.pcpu); else if (wq->cpu_wq.single) { /* the pointer to free is stored right after the cwq */ -- cgit v1.2.3 From 5d6523ebd2f67de9d23285aad7f3910e7b0aee83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 10 Mar 2012 00:07:36 +0100 Subject: sched: Fix load-balance wreckage Commit 367456c ("sched: Ditch per cgroup task lists for load-balancing") completely wrecked load-balancing due to a few silly mistakes. Correct those and remove more pointless code. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-zk04ihygwxn7qqrlpaf73b0r@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 110 +++++++++++++++++++--------------------------------- 1 file changed, 39 insertions(+), 71 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a0424fc4cc54..def17aa302d5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_load_add(&rq_of(cfs_rq)->load, se->load.weight); #ifdef CONFIG_SMP if (entity_is_task(se)) - list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); + list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); #endif cfs_rq->nr_running++; } @@ -3071,7 +3071,6 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; #define LBF_ALL_PINNED 0x01 #define LBF_NEED_BREAK 0x02 -#define LBF_ABORT 0x04 struct lb_env { struct sched_domain *sd; @@ -3083,7 +3082,7 @@ struct lb_env { struct rq *dst_rq; enum cpu_idle_type idle; - unsigned long max_load_move; + long load_move; unsigned int flags; unsigned int loop; @@ -3216,39 +3215,47 @@ static int move_one_task(struct lb_env *env) static unsigned long task_h_load(struct task_struct *p); -static unsigned long balance_tasks(struct lb_env *env) +/* + * move_tasks tries to move up to load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct lb_env *env) { - long rem_load_move = env->max_load_move; - struct task_struct *p, *n; + struct list_head *tasks = &env->src_rq->cfs_tasks; + struct task_struct *p; unsigned long load; int pulled = 0; - if (env->max_load_move == 0) - goto out; + if (env->load_move <= 0) + return 0; + + while (!list_empty(tasks)) { + p = list_first_entry(tasks, struct task_struct, se.group_node); - list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { env->loop++; /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) { - env->flags |= LBF_ABORT; + if (env->loop > env->loop_max) break; - } - /* take a beather every nr_migrate tasks */ + + /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { env->loop_break += sysctl_sched_nr_migrate; env->flags |= LBF_NEED_BREAK; break; } - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, - env->dst_cpu)) + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) goto next; load = task_h_load(p); + if (load < 16 && !env->sd->nr_balance_failed) goto next; - if ((load * 2) > rem_load_move) + if ((load / 2) > env->load_move) goto next; if (!can_migrate_task(p, env)) @@ -3256,7 +3263,7 @@ static unsigned long balance_tasks(struct lb_env *env) move_task(p, env); pulled++; - rem_load_move -= load; + env->load_move -= load; #ifdef CONFIG_PREEMPT /* @@ -3264,24 +3271,22 @@ static unsigned long balance_tasks(struct lb_env *env) * kernels will stop after the first task is pulled to minimize * the critical section. */ - if (env->idle == CPU_NEWLY_IDLE) { - env->flags |= LBF_ABORT; + if (env->idle == CPU_NEWLY_IDLE) break; - } #endif /* * We only want to steal up to the prescribed amount of * weighted load. */ - if (rem_load_move <= 0) + if (env->load_move <= 0) break; continue; next: - list_move_tail(&p->se.group_node, &env->src_rq->cfs_tasks); + list_move_tail(&p->se.group_node, tasks); } -out: + /* * Right now, this is one of only two places move_task() is called, * so we can safely collect move_task() stats here rather than @@ -3289,7 +3294,7 @@ out: */ schedstat_add(env->sd, lb_gained[env->idle], pulled); - return env->max_load_move - rem_load_move; + return pulled; } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3399,43 +3404,6 @@ static unsigned long task_h_load(struct task_struct *p) } #endif -/* - * move_tasks tries to move up to max_load_move weighted load from busiest to - * this_rq, as part of a balancing operation within domain "sd". - * Returns 1 if successful and 0 otherwise. - * - * Called with both runqueues locked. - */ -static int move_tasks(struct lb_env *env) -{ - unsigned long max_load_move = env->max_load_move; - unsigned long total_load_moved = 0, load_moved; - - update_h_load(cpu_of(env->src_rq)); - do { - env->max_load_move = max_load_move - total_load_moved; - load_moved = balance_tasks(env); - total_load_moved += load_moved; - - if (env->flags & (LBF_NEED_BREAK|LBF_ABORT)) - break; - -#ifdef CONFIG_PREEMPT - /* - * NEWIDLE balancing is a source of latency, so preemptible - * kernels will stop after the first task is pulled to minimize - * the critical section. - */ - if (env->idle == CPU_NEWLY_IDLE && env->dst_rq->nr_running) { - env->flags |= LBF_ABORT; - break; - } -#endif - } while (load_moved && max_load_move > total_load_moved); - - return total_load_moved > 0; -} - /********** Helpers for find_busiest_group ************************/ /* * sd_lb_stats - Structure to store the statistics of a sched_domain @@ -4477,31 +4445,31 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.max_load_move = imbalance; + env.load_move = imbalance; env.src_cpu = busiest->cpu; env.src_rq = busiest; env.loop_max = busiest->nr_running; +more_balance: local_irq_save(flags); double_rq_lock(this_rq, busiest); - ld_moved = move_tasks(&env); + if (!env.loop) + update_h_load(env.src_cpu); + ld_moved += move_tasks(&env); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * some other cpu did the load balance for us. */ if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); - if (env.flags & LBF_ABORT) - goto out_balanced; - - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto redo; - } - /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(env.flags & LBF_ALL_PINNED)) { cpumask_clear_cpu(cpu_of(busiest), cpus); -- cgit v1.2.3 From 5fbd036b552f633abb394a319f7c62a5c86a9cd7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Dec 2011 17:09:22 +0100 Subject: sched: Cleanup cpu_active madness Stepan found: CPU0 CPUn _cpu_up() __cpu_up() boostrap() notify_cpu_starting() set_cpu_online() while (!cpu_active()) cpu_relax() smp_call_function(.wait=1) /* we find cpu_online() is true */ arch_send_call_function_ipi_mask() /* wait-forever-more */ local_irq_enable() cpu_notify(CPU_ONLINE) sched_cpu_active() set_cpu_active() Now the purpose of cpu_active is mostly with bringing down a cpu, where we mark it !active to avoid the load-balancer from moving tasks to it while we tear down the cpu. This is required because we only update the sched_domain tree after we brought the cpu-down. And this is needed so that some tasks can still run while we bring it down, we just don't want new tasks to appear. On cpu-up however the sched_domain tree doesn't yet include the new cpu, so its invisible to the load-balancer, regardless of the active state. So instead of setting the active state after we boot the new cpu (and consequently having to wait for it before enabling interrupts) set the cpu active before we set it online and avoid the whole mess. Reported-by: Stepan Moskovchenko Signed-off-by: Peter Zijlstra Acked-by: Thomas Gleixner Link: http://lkml.kernel.org/r/1323965362.18942.71.camel@twins Signed-off-by: Ingo Molnar --- arch/arm/kernel/smp.c | 7 ------- arch/hexagon/kernel/smp.c | 2 -- arch/s390/kernel/smp.c | 6 ------ arch/x86/kernel/smpboot.c | 13 ------------- kernel/sched/core.c | 2 +- 5 files changed, 1 insertion(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index cdeb727527d3..d616ed51e7a7 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -295,13 +295,6 @@ asmlinkage void __cpuinit secondary_start_kernel(void) */ percpu_timer_setup(); - while (!cpu_active(cpu)) - cpu_relax(); - - /* - * cpu_active bit is set, so it's safe to enalbe interrupts - * now. - */ local_irq_enable(); local_fiq_enable(); diff --git a/arch/hexagon/kernel/smp.c b/arch/hexagon/kernel/smp.c index c871a2cffaef..0123c63e9a3a 100644 --- a/arch/hexagon/kernel/smp.c +++ b/arch/hexagon/kernel/smp.c @@ -179,8 +179,6 @@ void __cpuinit start_secondary(void) printk(KERN_INFO "%s cpu %d\n", __func__, current_thread_info()->cpu); set_cpu_online(cpu, true); - while (!cpumask_test_cpu(cpu, cpu_active_mask)) - cpu_relax(); local_irq_enable(); cpu_idle(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 2398ce6b15ae..b0e28c47ab83 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -550,12 +550,6 @@ int __cpuinit start_secondary(void *cpuvoid) S390_lowcore.restart_psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; __ctl_set_bit(0, 28); /* Enable lowcore protection */ - /* - * Wait until the cpu which brought this one up marked it - * active before enabling interrupts. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); local_irq_enable(); /* cpu_idle will call schedule for us */ cpu_idle(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 66d250c00d11..58f78165d308 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -291,19 +291,6 @@ notrace static void __cpuinit start_secondary(void *unused) per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; x86_platform.nmi_init(); - /* - * Wait until the cpu which brought this one up marked it - * online before enabling interrupts. If we don't do that then - * we can end up waking up the softirq thread before this cpu - * reached the active state, which makes the scheduler unhappy - * and schedule the softirq thread on the wrong cpu. This is - * only observable with forced threaded interrupts, but in - * theory it could also happen w/o them. It's just way harder - * to achieve. - */ - while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask)) - cpu_relax(); - /* enable local interrupts */ local_irq_enable(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95545126be1c..b1ccce819ce2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5410,7 +5410,7 @@ static int __cpuinit sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: + case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; -- cgit v1.2.3 From 554cecaf733623b327eef9652b65965eb1081b81 Mon Sep 17 00:00:00 2001 From: Diwakar Tundlam Date: Wed, 7 Mar 2012 14:44:26 -0800 Subject: sched/nohz: Correctly initialize 'next_balance' in 'nohz' idle balancer The 'next_balance' field of 'nohz' idle balancer must be initialized to jiffies. Since jiffies is initialized to negative 300 seconds the 'nohz' idle balancer does not run for the first 300s (5mins) after bootup. If no new processes are spawed or no idle cycles happen, the load on the cpus will remain unbalanced for that duration. Signed-off-by: Diwakar Tundlam Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1DD7BFEDD3147247B1355BEFEFE4665237994F30EF@HQMAIL04.nvidia.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index def17aa302d5..11f3979bad2a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5571,6 +5571,7 @@ __init void init_sched_fair_class(void) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ + nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); #endif -- cgit v1.2.3 From 3ccf3e8306156a28213adc720aba807e9a901ad5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Feb 2012 10:47:00 +0100 Subject: printk/sched: Introduce special printk_sched() for those awkward moments There's a few awkward printk()s inside of scheduler guts that people prefer to keep but really are rather deadlock prone. Fudge around it by storing the text in a per-cpu buffer and poll it using the existing printk_tick() handler. This will drop output when its more frequent than once a tick, however only the affinity thing could possible go that fast and for that just one should suffice to notify the admin he's done something silly.. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/n/tip-wua3lmkt3dg8nfts66o6brne@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/printk.h | 10 ++++++++++ kernel/printk.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/sched/core.c | 2 +- kernel/sched/rt.c | 8 +++++++- 4 files changed, 55 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/printk.h b/include/linux/printk.h index f0e22f75143f..1f77a4174ee0 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -100,6 +100,11 @@ int vprintk(const char *fmt, va_list args); asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); +/* + * Special printk facility for scheduler use only, _DO_NOT_USE_ ! + */ +__printf(1, 2) __cold int printk_sched(const char *fmt, ...); + /* * Please don't use printk_ratelimit(), because it shares ratelimiting state * with all other unrelated printk_ratelimit() callsites. Instead use @@ -127,6 +132,11 @@ int printk(const char *s, ...) { return 0; } +static inline __printf(1, 2) __cold +int printk_sched(const char *s, ...) +{ + return 0; +} static inline int printk_ratelimit(void) { return 0; diff --git a/kernel/printk.c b/kernel/printk.c index 13c0a1143f49..7ca7ba591e21 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1208,13 +1208,47 @@ int is_console_locked(void) return console_locked; } +/* + * Delayed printk facility, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +int printk_sched(const char *fmt, ...) +{ + unsigned long flags; + va_list args; + char *buf; + int r; + + local_irq_save(flags); + buf = __get_cpu_var(printk_sched_buf); + + va_start(args, fmt); + r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + va_end(args); + + __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + local_irq_restore(flags); + + return r; +} void printk_tick(void) { if (__this_cpu_read(printk_pending)) { - __this_cpu_write(printk_pending, 0); - wake_up_interruptible(&log_wait); + int pending = __this_cpu_xchg(printk_pending, 0); + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); } } @@ -1228,7 +1262,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { if (waitqueue_active(&log_wait)) - this_cpu_write(printk_pending, 1); + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b1ccce819ce2..8781cec7c3e6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1284,7 +1284,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", + printk_sched("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7f7e7cdcb472..b60dad720173 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -864,8 +864,14 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. */ if (likely(rt_b->rt_runtime)) { + static bool once = false; + rt_rq->rt_throttled = 1; - printk_once(KERN_WARNING "sched: RT throttling activated\n"); + + if (!once) { + once = true; + printk_sched("sched: RT throttling activated\n"); + } } else { /* * In case we did anyway, make it go away, -- cgit v1.2.3 From 8e3fabfde445a872c8aec2296846badf24d7c8b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Mar 2012 18:54:26 +0100 Subject: sched: Update yield() docs Suggested-by: Joe Perches Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1331056466.11248.327.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8781cec7c3e6..47614a5cdd47 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4577,8 +4577,24 @@ EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * This is a shortcut for kernel-space yielding - it marks the - * thread runnable and calls sys_sched_yield(). + * Do not ever use this function, there's a 99% chance you're doing it wrong. + * + * The scheduler is at all times free to pick the calling task as the most + * eligible task to run, if removing the yield() call from your code breaks + * it, its already broken. + * + * Typical broken usage is: + * + * while (!event) + * yield(); + * + * where one assumes that yield() will let 'the other' process run that will + * make event true. If the current task is a SCHED_FIFO task that will never + * happen. Never use yield() as a progress guarantee!! + * + * If you want to use yield() to wait for something, use wait_event(). + * If you want to use yield() to be 'nice' for others, use cond_resched(). + * If you still want to use yield(), do not! */ void __sched yield(void) { -- cgit v1.2.3 From c308b56b5398779cd3da0f62ab26b0453494c3d4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Mar 2012 15:04:46 +0100 Subject: sched: Fix nohz load accounting -- again! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Various people reported nohz load tracking still being wrecked, but Doug spotted the actual problem. We fold the nohz remainder in too soon, causing us to loose samples and under-account. So instead of playing catch-up up-front, always do a single load-fold with whatever state we encounter and only then fold the nohz remainder and play catch-up. Reported-by: Doug Smythies Reported-by: LesÃ…=82aw Kope=C4=87 Reported-by: Aman Gupta Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-4v31etnhgg9kwd6ocgx3rxl8@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 47614a5cdd47..e3ccc13c4caa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2266,13 +2266,10 @@ calc_load_n(unsigned long load, unsigned long exp, * Once we've updated the global active value, we need to apply the exponential * weights adjusted to the number of cycles missed. */ -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { long delta, active, n; - if (time_before(jiffies, calc_load_update)) - return; - /* * If we crossed a calc_load_update boundary, make sure to fold * any pending idle changes, the respective CPUs might have @@ -2284,31 +2281,25 @@ static void calc_global_nohz(unsigned long ticks) atomic_long_add(delta, &calc_load_tasks); /* - * If we were idle for multiple load cycles, apply them. + * It could be the one fold was all it took, we done! */ - if (ticks >= LOAD_FREQ) { - n = ticks / LOAD_FREQ; + if (time_before(jiffies, calc_load_update + 10)) + return; - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - calc_load_update += n * LOAD_FREQ; - } + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - /* - * Its possible the remainder of the above division also crosses - * a LOAD_FREQ period, the regular check in calc_global_load() - * which comes after this will take care of that. - * - * Consider us being 11 ticks before a cycle completion, and us - * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will - * age us 4 cycles, and the test in calc_global_load() will - * pick up the final one. - */ + calc_load_update += n * LOAD_FREQ; } #else void calc_load_account_idle(struct rq *this_rq) @@ -2320,7 +2311,7 @@ static inline long calc_load_fold_idle(void) return 0; } -static void calc_global_nohz(unsigned long ticks) +static void calc_global_nohz(void) { } #endif @@ -2348,8 +2339,6 @@ void calc_global_load(unsigned long ticks) { long active; - calc_global_nohz(ticks); - if (time_before(jiffies, calc_load_update + 10)) return; @@ -2361,6 +2350,16 @@ void calc_global_load(unsigned long ticks) avenrun[2] = calc_load(avenrun[2], EXP_15, active); calc_load_update += LOAD_FREQ; + + /* + * Account one period with whatever state we found before + * folding in the nohz state and ageing the entire idle period. + * + * This avoids loosing a sample when we go idle between + * calc_load_account_active() (10 ticks ago) and now and thus + * under-accounting. + */ + calc_global_nohz(); } /* -- cgit v1.2.3 From 01f23e1630d944f7085cd8fd5793e31ea91c03d8 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Sun, 27 Nov 2011 21:43:10 +0000 Subject: sched/arch: Introduce the finish_arch_post_lock_switch() scheduler callback This callback is called by the scheduler after rq->lock has been released and interrupts enabled. It will be used in subsequent patches on the ARM architecture. Signed-off-by: Catalin Marinas Reviewed-by: Will Deacon Reviewed-by: Frank Rowand Tested-by: Will Deacon Tested-by: Marc Zyngier Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/n/20120313110840.7b444deb6b1bb902c15f3cdf@canb.auug.org.au Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + kernel/sched/sched.h | 3 +++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b342f57879e6..423f40f32a59 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1932,6 +1932,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) local_irq_enable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ finish_lock_switch(rq, prev); + finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); if (mm) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 98c0c2623db8..d72483d07c9f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -692,6 +692,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p) #ifndef finish_arch_switch # define finish_arch_switch(prev) do { } while (0) #endif +#ifndef finish_arch_post_lock_switch +# define finish_arch_post_lock_switch() do { } while (0) +#endif #ifndef __ARCH_WANT_UNLOCKED_CTXSW static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -- cgit v1.2.3 From db6544e0075d192e5ad16eda8689c55fa9c6f8f4 Mon Sep 17 00:00:00 2001 From: Rajesh Bhagat Date: Fri, 17 Feb 2012 13:59:15 +0530 Subject: ftrace: Fix function_graph for archs that test ftrace_trace_function When CONFIG_DYNAMIC_FTRACE is not set, some archs (ARM) test the variable function_trace_function to determine if it should call the function tracer. If it is not set to ftrace_stub, then it will call the function and return, and not call the function graph tracer. But some of these archs (ARM) do not have the assembly code to test if function tracing is enabled or not (quick stop of tracing) and it calls the helper routine ftrace_test_stop_func() instead. If function tracer is enabled and then disabled, the variable ftrace_trace_function is still set to the helper routine ftrace_test_stop_func(), and not to ftrace_stub. This will prevent the function graph tracer from ever running. Output before patch /debug/tracing # echo function > current_tracer /debug/tracing # echo function_graph > current_tracer /debug/tracing # cat trace Output after patch /debug/tracing # echo function > current_tracer /debug/tracing # echo function_graph > current_tracer /debug/tracing # cat trace 0) ! 253.375 us | } /* irq_enter */ 0) | generic_handle_irq() { 0) | handle_fasteoi_irq() { 0) 9.208 us | _raw_spin_lock(); 0) | handle_irq_event() { 0) | handle_irq_event_percpu() { Signed-off-by: Rajesh Bhagat Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 867bd1dd2dd0..0fa92f677c92 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -249,7 +249,8 @@ static void update_ftrace_function(void) #else __ftrace_trace_function = func; #endif - ftrace_trace_function = ftrace_test_stop_func; + ftrace_trace_function = + (func == ftrace_stub) ? func : ftrace_test_stop_func; #endif } -- cgit v1.2.3 From fa73dc9400516945bcbae8d98c23393bcefe1440 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 28 Feb 2012 11:02:46 +0000 Subject: tracing: Fix build breakage without CONFIG_PERF_EVENTS Today's -next fails to build for me: CC kernel/trace/trace_export.o In file included from kernel/trace/trace_export.c:197: kernel/trace/trace_entries.h:58: error: 'perf_ftrace_event_register' undeclared here (not in a function) make[2]: *** [kernel/trace/trace_export.o] Error 1 make[1]: *** [kernel/trace] Error 2 make: *** [kernel] Error 2 because as of ced390 (ftrace, perf: Add support to use function tracepoint in perf) perf_trace_event_register() is declared in trace.h only if CONFIG_PERF_EVENTS is enabled but I don't have that set. Ensure that we always have a definition of perf_trace_event_register() by making the definition unconditional. Link: http://lkml.kernel.org/r/1330426967-17067-1-git-send-email-broonie@opensource.wolfsonmicro.com Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Signed-off-by: Mark Brown Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ce887c0eca56..95059f091a24 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -836,13 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[]; filter) #include "trace_entries.h" -#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_FUNCTION_TRACER int perf_ftrace_event_register(struct ftrace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL #endif /* CONFIG_FUNCTION_TRACER */ -#endif /* CONFIG_PERF_EVENTS */ #endif /* _LINUX_KERNEL_TRACE_H */ -- cgit v1.2.3 From 3047817b894ddae62be07787bc8735a616104398 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 9 Mar 2012 07:20:12 +0100 Subject: padata: Fix race in the serialization path When a padata object is queued to the serialization queue, another cpu might process and free the padata object. So don't dereference it after queueing to the serialization queue. Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index b45259931512..aa9929545855 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -230,6 +230,7 @@ out: static void padata_reorder(struct parallel_data *pd) { + int cb_cpu; struct padata_priv *padata; struct padata_serial_queue *squeue; struct padata_instance *pinst = pd->pinst; @@ -270,13 +271,14 @@ static void padata_reorder(struct parallel_data *pd) return; } - squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); + cb_cpu = padata->cb_cpu; + squeue = per_cpu_ptr(pd->squeue, cb_cpu); spin_lock(&squeue->serial.lock); list_add_tail(&padata->list, &squeue->serial.list); spin_unlock(&squeue->serial.lock); - queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); + queue_work_on(cb_cpu, pinst->wq, &squeue->work); } spin_unlock_bh(&pd->lock); -- cgit v1.2.3 From 2dc9b5dbdef09840de852a4f0cc6a9c9eece7220 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 9 Mar 2012 07:20:49 +0100 Subject: padata: Fix race on sequence number wrap When padata_do_parallel() is called from multiple cpus for the same padata instance, we can get object reordering on sequence number wrap because testing for sequence number wrap and reseting the sequence number must happen atomically but is implemented with two atomic operations. This patch fixes this by converting the sequence number from atomic_t to an unsigned int and protect the access with a spin_lock. As a side effect, we get rid of the sequence number wrap handling because the seqence number wraps back to null now without the need to do anything. Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- include/linux/padata.h | 6 ++---- kernel/padata.c | 38 ++++++++++---------------------------- 2 files changed, 12 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/padata.h b/include/linux/padata.h index 4633b2f726b6..86292beebfe2 100644 --- a/include/linux/padata.h +++ b/include/linux/padata.h @@ -46,7 +46,6 @@ struct padata_priv { struct list_head list; struct parallel_data *pd; int cb_cpu; - int seq_nr; int info; void (*parallel)(struct padata_priv *padata); void (*serial)(struct padata_priv *padata); @@ -116,7 +115,6 @@ struct padata_cpumask { * @pinst: padata instance. * @pqueue: percpu padata queues used for parallelization. * @squeue: percpu padata queues used for serialuzation. - * @seq_nr: The sequence number that will be attached to the next object. * @reorder_objects: Number of objects waiting in the reorder queues. * @refcnt: Number of objects holding a reference on this parallel_data. * @max_seq_nr: Maximal used sequence number. @@ -129,12 +127,12 @@ struct parallel_data { struct padata_instance *pinst; struct padata_parallel_queue __percpu *pqueue; struct padata_serial_queue __percpu *squeue; - atomic_t seq_nr; atomic_t reorder_objects; atomic_t refcnt; - unsigned int max_seq_nr; struct padata_cpumask cpumask; spinlock_t lock ____cacheline_aligned; + spinlock_t seq_lock; + unsigned int seq_nr; unsigned int processed; struct timer_list timer; }; diff --git a/kernel/padata.c b/kernel/padata.c index aa9929545855..6f10eb285ece 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -29,7 +29,6 @@ #include #include -#define MAX_SEQ_NR (INT_MAX - NR_CPUS) #define MAX_OBJ_NUM 1000 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) @@ -43,18 +42,19 @@ static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) return target_cpu; } -static int padata_cpu_hash(struct padata_priv *padata) +static int padata_cpu_hash(struct parallel_data *pd) { int cpu_index; - struct parallel_data *pd; - - pd = padata->pd; /* * Hash the sequence numbers to the cpus by taking * seq_nr mod. number of cpus in use. */ - cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); + + spin_lock(&pd->seq_lock); + cpu_index = pd->seq_nr % cpumask_weight(pd->cpumask.pcpu); + pd->seq_nr++; + spin_unlock(&pd->seq_lock); return padata_index_to_cpu(pd, cpu_index); } @@ -132,12 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst, padata->pd = pd; padata->cb_cpu = cb_cpu; - if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) - atomic_set(&pd->seq_nr, -1); - - padata->seq_nr = atomic_inc_return(&pd->seq_nr); - - target_cpu = padata_cpu_hash(padata); + target_cpu = padata_cpu_hash(pd); queue = per_cpu_ptr(pd->pqueue, target_cpu); spin_lock(&queue->parallel.lock); @@ -173,7 +168,7 @@ EXPORT_SYMBOL(padata_do_parallel); static struct padata_priv *padata_get_next(struct parallel_data *pd) { int cpu, num_cpus; - int next_nr, next_index; + unsigned int next_nr, next_index; struct padata_parallel_queue *queue, *next_queue; struct padata_priv *padata; struct padata_list *reorder; @@ -189,14 +184,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) cpu = padata_index_to_cpu(pd, next_index); next_queue = per_cpu_ptr(pd->pqueue, cpu); - if (unlikely(next_nr > pd->max_seq_nr)) { - next_nr = next_nr - pd->max_seq_nr - 1; - next_index = next_nr % num_cpus; - cpu = padata_index_to_cpu(pd, next_index); - next_queue = per_cpu_ptr(pd->pqueue, cpu); - pd->processed = 0; - } - padata = NULL; reorder = &next_queue->reorder; @@ -205,8 +192,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd) padata = list_entry(reorder->list.next, struct padata_priv, list); - BUG_ON(next_nr != padata->seq_nr); - spin_lock(&reorder->lock); list_del_init(&padata->list); atomic_dec(&pd->reorder_objects); @@ -402,7 +387,7 @@ static void padata_init_squeues(struct parallel_data *pd) /* Initialize all percpu queues used by parallel workers */ static void padata_init_pqueues(struct parallel_data *pd) { - int cpu_index, num_cpus, cpu; + int cpu_index, cpu; struct padata_parallel_queue *pqueue; cpu_index = 0; @@ -417,9 +402,6 @@ static void padata_init_pqueues(struct parallel_data *pd) INIT_WORK(&pqueue->work, padata_parallel_worker); atomic_set(&pqueue->num_obj, 0); } - - num_cpus = cpumask_weight(pd->cpumask.pcpu); - pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; } /* Allocate and initialize the internal cpumask dependend resources. */ @@ -446,7 +428,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, padata_init_pqueues(pd); padata_init_squeues(pd); setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); - atomic_set(&pd->seq_nr, -1); + pd->seq_nr = 0; atomic_set(&pd->reorder_objects, 0); atomic_set(&pd->refcnt, 0); pd->pinst = pinst; -- cgit v1.2.3 From 7140ea1980f2fae9c7aaeac5f6b35317e1389ee6 Mon Sep 17 00:00:00 2001 From: Ido Yariv Date: Fri, 2 Dec 2011 18:24:12 +0200 Subject: genirq: Flush the irq thread on synchronization The current implementation does not always flush the threaded handler when disabling the irq. In case the irq handler was called, but the threaded handler hasn't started running yet, the interrupt will be flagged as pending, and the handler will not run. This implementation has some issues: First, if the interrupt is a wake source and flagged as pending, the system will not be able to suspend. Second, when quickly disabling and re-enabling the irq, the threaded handler might continue to run after the irq is re-enabled without the irq handler being called first. This might be an unexpected behavior. In addition, it might be counter-intuitive that the threaded handler will not be called even though the irq handler was called and returned IRQ_WAKE_THREAD. Fix this by always waiting for the threaded handler to complete in synchronize_irq(). [ tglx: Massaged comments, added WARN_ONs and the missing IRQTF_RUNTHREAD check in exit_irq_thread() ] Signed-off-by: Ido Yariv Link: http://lkml.kernel.org/r/1322843052-7166-1-git-send-email-ido@wizery.com Signed-off-by: Thomas Gleixner --- kernel/irq/handle.c | 12 +++++++++++ kernel/irq/manage.c | 60 ++++++++++++++++++++++++++++------------------------- 2 files changed, 44 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 500aaf67c546..6ff84e6a954c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -110,6 +110,18 @@ static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) * threads_oneshot untouched and runs the thread another time. */ desc->threads_oneshot |= action->thread_mask; + + /* + * We increment the threads_active counter in case we wake up + * the irq thread. The irq thread decrements the counter when + * it returns from the handler or in the exit path and wakes + * up waiters which are stuck in synchronize_irq() when the + * active count becomes zero. synchronize_irq() is serialized + * against this code (hard irq handler) via IRQS_INPROGRESS + * like the finalize_oneshot() code. See comment above. + */ + atomic_inc(&desc->threads_active); + wake_up_process(action->thread); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1786cf7dac54..453feedbb390 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -759,6 +759,13 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, return ret; } +static void wake_threads_waitq(struct irq_desc *desc) +{ + if (atomic_dec_and_test(&desc->threads_active) && + waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); +} + /* * Interrupt handler thread */ @@ -771,7 +778,6 @@ static int irq_thread(void *data) struct irq_desc *desc = irq_to_desc(action->irq); irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); - int wake; if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) @@ -783,39 +789,30 @@ static int irq_thread(void *data) current->irq_thread = 1; while (!irq_wait_for_interrupt(action)) { + irqreturn_t action_ret; irq_thread_check_affinity(desc, action); - atomic_inc(&desc->threads_active); + action_ret = handler_fn(desc, action); + if (!noirqdebug) + note_interrupt(action->irq, desc, action_ret); - raw_spin_lock_irq(&desc->lock); - if (unlikely(irqd_irq_disabled(&desc->irq_data))) { - /* - * CHECKME: We might need a dedicated - * IRQ_THREAD_PENDING flag here, which - * retriggers the thread in check_irq_resend() - * but AFAICT IRQS_PENDING should be fine as it - * retriggers the interrupt itself --- tglx - */ - desc->istate |= IRQS_PENDING; - raw_spin_unlock_irq(&desc->lock); - } else { - irqreturn_t action_ret; - - raw_spin_unlock_irq(&desc->lock); - action_ret = handler_fn(desc, action); - if (!noirqdebug) - note_interrupt(action->irq, desc, action_ret); - } - - wake = atomic_dec_and_test(&desc->threads_active); - - if (wake && waitqueue_active(&desc->wait_for_threads)) - wake_up(&desc->wait_for_threads); + wake_threads_waitq(desc); } - /* Prevent a stale desc->threads_oneshot */ - irq_finalize_oneshot(desc, action, true); + /* + * This is the regular exit path. __free_irq() is stopping the + * thread via kthread_stop() after calling + * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the + * oneshot mask bit should be set. + * + * Verify that this is true. + */ + if (WARN_ON(test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))) + wake_threads_waitq(desc); + + if (WARN_ON(desc->threads_oneshot & action->thread_mask)) + irq_finalize_oneshot(desc, action, true); /* * Clear irq_thread. Otherwise exit_irq_thread() would make @@ -845,6 +842,13 @@ void exit_irq_thread(void) desc = irq_to_desc(action->irq); + /* + * If IRQTF_RUNTHREAD is set, we need to decrement + * desc->threads_active and wake possible waiters. + */ + if (test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + wake_threads_waitq(desc); + /* Prevent a stale desc->threads_oneshot */ irq_finalize_oneshot(desc, action, true); } -- cgit v1.2.3 From 600e145882802d6ccbfe2c4aea243d97caeb91a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Mar 2012 12:35:37 +0100 Subject: printk: Make it compile with !CONFIG_PRINTK Commit 3ccf3e830615 ("printk/sched: Introduce special printk_sched() for those awkward moments") overlooked an #ifdef, so move code around to respect these directives. Reported-by: Stephen Rothwell Signed-off-by: Peter Zijlstra Cc: Randy Dunlap Link: http://lkml.kernel.org/r/1331811337.18960.179.camel@twins Signed-off-by: Ingo Molnar --- kernel/printk.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 49a2ae4a8dc7..b64ce71cb2e5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1222,26 +1222,6 @@ int is_console_locked(void) static DEFINE_PER_CPU(int, printk_pending); static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); -int printk_sched(const char *fmt, ...) -{ - unsigned long flags; - va_list args; - char *buf; - int r; - - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - - va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); - va_end(args); - - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); - local_irq_restore(flags); - - return r; -} - void printk_tick(void) { if (__this_cpu_read(printk_pending)) { @@ -1658,6 +1638,26 @@ late_initcall(printk_late_init); #if defined CONFIG_PRINTK +int printk_sched(const char *fmt, ...) +{ + unsigned long flags; + va_list args; + char *buf; + int r; + + local_irq_save(flags); + buf = __get_cpu_var(printk_sched_buf); + + va_start(args, fmt); + r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + va_end(args); + + __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + local_irq_restore(flags); + + return r; +} + /* * printk rate limiting, lifted from the networking subsystem. * -- cgit v1.2.3 From a078c6d0e6288fad6d83fb6d5edd91ddb7b6ab33 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 15 Mar 2012 12:36:14 -0400 Subject: ntp: Fix integer overflow when setting time 'long secs' is passed as divisor to div_s64, which accepts a 32bit divisor. On 64bit machines that value is trimmed back from 8 bytes back to 4, causing a divide by zero when the number is bigger than (1 << 32) - 1 and all 32 lower bits are 0. Use div64_long() instead. Signed-off-by: Sasha Levin Cc: johnstul@us.ibm.com Link: http://lkml.kernel.org/r/1331829374-31543-2-git-send-email-levinsasha928@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 17fb1b9807d0..6e039b144daf 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -289,7 +289,7 @@ static inline s64 ntp_update_offset_fll(s64 offset64, long secs) time_status |= STA_MODE; - return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); + return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } static void ntp_update_offset(long offset) -- cgit v1.2.3 From 79f0713d403c800db9d89134e2fd7f846e68d6ee Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 15 Mar 2012 15:17:10 -0700 Subject: prctl: use CAP_SYS_RESOURCE for PR_SET_MM option CAP_SYS_ADMIN is already overloaded left and right, so to have more fine-grained access control use CAP_SYS_RESOURCE here. The CAP_SYS_RESOUCE is chosen because this prctl option allows a current process to adjust some fields of memory map descriptor which rather represents what the process owns: pointers to code, data, stack segments, command line, auxiliary vector data and etc. Suggested-by: Michael Kerrisk Acked-by: Kees Cook Acked-by: Michael Kerrisk Cc: Pavel Emelyanov Cc: Tejun Heo Cc: Oleg Nesterov Cc: Paul Bolle Cc: KOSAKI Motohiro Signed-off-by: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 40701538fbd1..888d227fd195 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1706,7 +1706,7 @@ static int prctl_set_mm(int opt, unsigned long addr, if (arg4 | arg5) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_RESOURCE)) return -EPERM; if (addr >= TASK_SIZE) -- cgit v1.2.3 From f695cf94837de53864180400cbac42cfa370426f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 14 Mar 2012 16:38:15 -0700 Subject: time: Fix change_clocksource locking change_clocksource() fails to grab locks or call timekeeping_update(), which leaves a race window for time inconsistencies. This adds proper locking and a call to timekeeping_update() to fix this. CC: Andy Lutomirski CC: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 403c2a092830..b53da5ecbea2 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -448,9 +448,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset); static int change_clocksource(void *data) { struct clocksource *new, *old; + unsigned long flags; new = (struct clocksource *) data; + write_seqlock_irqsave(&timekeeper.lock, flags); + timekeeping_forward_now(); if (!new->enable || new->enable(new) == 0) { old = timekeeper.clock; @@ -458,6 +461,10 @@ static int change_clocksource(void *data) if (old->disable) old->disable(old); } + timekeeping_update(true); + + write_sequnlock_irqrestore(&timekeeper.lock, flags); + return 0; } -- cgit v1.2.3 From a4ca1298d8a0472a45624fa5fb99f90f0f367187 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Feb 2012 19:46:03 +0000 Subject: time: Remove bogus comments There is no global irq lock which makes a syscall magically SMP safe. Remove the outdated comment concerning do_settimeofday() as well. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- kernel/time.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index 73e416db0a1e..ba744cf80696 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) return error; if (tz) { - /* SMP safe, global irq locking makes it work. */ sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { @@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) } } if (tv) - { - /* SMP safe, again the code in arch/foo/time.c should - * globally block out interrupts when it runs. - */ return do_settimeofday(tv); - } return 0; } -- cgit v1.2.3 From e04268b0effc0ceea366c50b3107baad9edadafa Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Mar 2012 22:55:21 +0100 Subject: genirq: Remove paranoid warnons and bogus fixups Alexander pointed out that the warnons in the regular exit path are bogus and the thread_mask one actually could be triggered when __setup_irq() hands out that thread_mask again after __free_irq() dropped irq_desc->lock. Thinking more about it, neither IRQTF_RUNTHREAD nor the bit in thread_mask can be set as this is the regular exit path. We come here due to: __free_irq() remove action from desc synchronize_irq() kthread_stop() So synchronize_irq() makes sure that the thread finished running and cleaned up both the thread_active count and thread_mask. After that point nothing can set IRQTF_RUNTHREAD on this action. So the warnons and the cleanups are pointless. Reported-by: Alexander Gordeev Cc: Ido Yariv Link: http://lkml.kernel.org/r/20120315190755.GA6732@dhcp-26-207.brq.redhat.com Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 453feedbb390..b0ccd1ac2d6a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -804,17 +804,11 @@ static int irq_thread(void *data) * This is the regular exit path. __free_irq() is stopping the * thread via kthread_stop() after calling * synchronize_irq(). So neither IRQTF_RUNTHREAD nor the - * oneshot mask bit should be set. + * oneshot mask bit can be set. We cannot verify that as we + * cannot touch the oneshot mask at this point anymore as + * __setup_irq() might have given out currents thread_mask + * again. * - * Verify that this is true. - */ - if (WARN_ON(test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags))) - wake_threads_waitq(desc); - - if (WARN_ON(desc->threads_oneshot & action->thread_mask)) - irq_finalize_oneshot(desc, action, true); - - /* * Clear irq_thread. Otherwise exit_irq_thread() would make * fuzz about an active irq thread going into nirvana. */ -- cgit v1.2.3 From d762a50b5b1bb93e91cb3cd90b6ae133da98fe31 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:38 +0800 Subject: kdb: remove the second argument of k[un]map_atomic() Signed-off-by: Cong Wang --- kernel/debug/kdb/kdb_support.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 7d6fb40d2188..d35cc2d3a4cc 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c @@ -384,9 +384,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size) if (!pfn_valid(pfn)) return 1; page = pfn_to_page(pfn); - vaddr = kmap_atomic(page, KM_KDB); + vaddr = kmap_atomic(page); memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); - kunmap_atomic(vaddr, KM_KDB); + kunmap_atomic(vaddr); return 0; } -- cgit v1.2.3 From 0de9a1e28a0d005f42c8cc5456a246710133b9ab Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 25 Nov 2011 23:14:38 +0800 Subject: power: remove the second argument of k[un]map_atomic() Acked-by: Rafael J. Wysocki Signed-off-by: Cong Wang --- kernel/power/snapshot.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 6a768e537001..3a564ac85f36 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1000,20 +1000,20 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) s_page = pfn_to_page(src_pfn); d_page = pfn_to_page(dst_pfn); if (PageHighMem(s_page)) { - src = kmap_atomic(s_page, KM_USER0); - dst = kmap_atomic(d_page, KM_USER1); + src = kmap_atomic(s_page); + dst = kmap_atomic(d_page); do_copy_page(dst, src); - kunmap_atomic(dst, KM_USER1); - kunmap_atomic(src, KM_USER0); + kunmap_atomic(dst); + kunmap_atomic(src); } else { if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(d_page, KM_USER0); + dst = kmap_atomic(d_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); } else { safe_copy_page(page_address(d_page), s_page); } @@ -1728,9 +1728,9 @@ int snapshot_read_next(struct snapshot_handle *handle) */ void *kaddr; - kaddr = kmap_atomic(page, KM_USER0); + kaddr = kmap_atomic(page); copy_page(buffer, kaddr); - kunmap_atomic(kaddr, KM_USER0); + kunmap_atomic(kaddr); handle->buffer = buffer; } else { handle->buffer = page_address(page); @@ -2014,9 +2014,9 @@ static void copy_last_highmem_page(void) if (last_highmem_page) { void *dst; - dst = kmap_atomic(last_highmem_page, KM_USER0); + dst = kmap_atomic(last_highmem_page); copy_page(dst, buffer); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst); last_highmem_page = NULL; } } @@ -2309,13 +2309,13 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf) { void *kaddr1, *kaddr2; - kaddr1 = kmap_atomic(p1, KM_USER0); - kaddr2 = kmap_atomic(p2, KM_USER1); + kaddr1 = kmap_atomic(p1); + kaddr2 = kmap_atomic(p2); copy_page(buf, kaddr1); copy_page(kaddr1, kaddr2); copy_page(kaddr2, buf); - kunmap_atomic(kaddr2, KM_USER1); - kunmap_atomic(kaddr1, KM_USER0); + kunmap_atomic(kaddr2); + kunmap_atomic(kaddr1); } /** -- cgit v1.2.3 From 5f8aadd8b9966d71a77bba52b9d499cc2f38269f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 14 Mar 2012 19:55:38 +0100 Subject: CLONE_PARENT shouldn't allow to set ->exit_signal The child must not control its ->exit_signal, it is the parent who decides which signal the child should use for notification. This means that CLONE_PARENT should not use "clone_flags & CSIGNAL", the forking task is the sibling of the new process and their parent doesn't control exit_signal in this case. This patch uses ->exit_signal of the forking process, but perhaps we should simply use SIGCHLD. We read group_leader->exit_signal lockless, this can race with the ORIGINAL_SIGNAL -> SIGCHLD transition, but this is fine. Potentially this change allows to kill self_exec_id/parent_exec_id. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/fork.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 26a7a6707fa7..c4f38a849436 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1340,7 +1340,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, clear_all_latency_tracing(p); /* ok, now we should be set up.. */ - p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else if (clone_flags & CLONE_PARENT) + p->exit_signal = current->group_leader->exit_signal; + else + p->exit_signal = (clone_flags & CSIGNAL); + p->pdeath_signal = 0; p->exit_state = 0; -- cgit v1.2.3 From e636825346b36a07ccfc8e30946d52855e21f681 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:03:22 +0100 Subject: exit_signal: simplify the "we have changed execution domain" logic exit_notify() checks "tsk->self_exec_id != tsk->parent_exec_id" to handle the "we have changed execution domain" case. We can change do_thread() to always set ->exit_signal = SIGCHLD and remove this check to simplify the code. We could change setup_new_exec() instead, this looks more logical because it increments ->self_exec_id. But note that de_thread() already resets ->exit_signal if it changes the leader, let's keep both changes close to each other. Note that we change ->exit_signal lockless, this changes the rules. Thereafter ->exit_signal is not stable under tasklist but this is fine, the only possible change is OLDSIG -> SIGCHLD. This can race with eligible_child() but the race is harmless. We can race with reparent_leader() which changes our ->exit_signal in parallel, but it does the same change to SIGCHLD. The noticeable user-visible change is that the execing task is not "visible" to do_wait()->eligible_child(__WCLONE) right after exec. To me this looks more logical, and this is consistent with mt case. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/exec.c | 3 +++ kernel/exit.c | 7 +------ 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index b0695a9900ef..1e94d2263ae0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -977,6 +977,9 @@ static int de_thread(struct task_struct *tsk) sig->notify_count = 0; no_thread_group: + /* we have changed execution domain */ + tsk->exit_signal = SIGCHLD; + if (current->mm) setmax_mm_hiwater_rss(&sig->maxrss, current->mm); diff --git a/kernel/exit.c b/kernel/exit.c index 752d2c0abd19..51ac4ced1313 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -827,14 +827,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) * If the parent exec id doesn't match the exec id we saved * when we started then we know the parent has changed security * domain. - * - * If our self_exec id doesn't match our parent_exec_id then - * we have changed execution domain as these two values started - * the same after a fork. */ if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id)) + tsk->parent_exec_id != tsk->real_parent->self_exec_id) tsk->exit_signal = SIGCHLD; if (unlikely(tsk->ptrace)) { -- cgit v1.2.3 From b6e238dceed36891cc633167afe7151f1f3d83c5 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 19 Mar 2012 17:03:41 +0100 Subject: exit_signal: fix the "parent has changed security domain" logic exit_notify() changes ->exit_signal if the parent already did exec. This doesn't really work, we are not going to send the signal now if there is another live thread or the exiting task is traced. The parent can exec before the last dies or the tracer detaches. Move this check into do_notify_parent() which actually sends the signal. The user-visible change is that we do not change ->exit_signal, and thus the exiting task is still "clone children" for do_wait()->eligible_child(__WCLONE). Hopefully this is fine, the current logic is racy anyway. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 14 -------------- kernel/signal.c | 9 +++++++++ 2 files changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 51ac4ced1313..ce5f758f40bd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -818,20 +818,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); - /* Let father know we died - * - * Thread signals are configurable, but you aren't going to use - * that to send signals to arbitrary processes. - * That stops right now. - * - * If the parent exec id doesn't match the exec id we saved - * when we started then we know the parent has changed security - * domain. - */ - if (thread_group_leader(tsk) && tsk->exit_signal != SIGCHLD && - tsk->parent_exec_id != tsk->real_parent->self_exec_id) - tsk->exit_signal = SIGCHLD; - if (unlikely(tsk->ptrace)) { int sig = thread_group_leader(tsk) && thread_group_empty(tsk) && diff --git a/kernel/signal.c b/kernel/signal.c index 8511e39813c7..e76001ccf5cd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1652,6 +1652,15 @@ bool do_notify_parent(struct task_struct *tsk, int sig) BUG_ON(!tsk->ptrace && (tsk->group_leader != tsk || !thread_group_empty(tsk))); + if (sig != SIGCHLD) { + /* + * This is only possible if parent == real_parent. + * Check if it has changed security domain. + */ + if (tsk->parent_exec_id != tsk->parent->self_exec_id) + sig = SIGCHLD; + } + info.si_signo = sig; info.si_errno = 0; /* -- cgit v1.2.3 From 48fde701aff662559b38d9a609574068f22d00fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 8 Jan 2012 22:15:13 -0500 Subject: switch open-coded instances of d_make_root() to new helper Signed-off-by: Al Viro --- arch/powerpc/platforms/cell/spufs/inode.c | 4 ++-- arch/s390/hypfs/inode.c | 6 ++---- drivers/misc/ibmasm/ibmasmfs.c | 6 ++---- drivers/oprofile/oprofilefs.c | 6 ++---- drivers/usb/core/inode.c | 9 +-------- drivers/usb/gadget/f_fs.c | 8 ++------ drivers/usb/gadget/inode.c | 4 +--- fs/9p/vfs_super.c | 3 +-- fs/adfs/super.c | 3 +-- fs/affs/super.c | 7 ++----- fs/afs/super.c | 7 ++----- fs/autofs4/inode.c | 10 ++-------- fs/befs/linuxvfs.c | 3 +-- fs/bfs/inode.c | 3 +-- fs/btrfs/super.c | 8 ++------ fs/ceph/super.c | 3 +-- fs/cifs/cifsfs.c | 4 +--- fs/coda/inode.c | 3 +-- fs/configfs/mount.c | 3 +-- fs/cramfs/inode.c | 6 ++---- fs/devpts/inode.c | 3 +-- fs/ecryptfs/main.c | 3 +-- fs/efs/super.c | 3 +-- fs/exofs/super.c | 3 +-- fs/ext2/super.c | 3 +-- fs/ext3/super.c | 3 +-- fs/ext4/super.c | 3 +-- fs/freevxfs/vxfs_super.c | 3 +-- fs/fuse/inode.c | 9 ++------- fs/gfs2/ops_fstype.c | 3 +-- fs/hfs/super.c | 6 ++---- fs/hostfs/hostfs_kern.c | 4 ++-- fs/hpfs/super.c | 6 ++---- fs/hppfs/hppfs.c | 9 ++------- fs/hugetlbfs/inode.c | 13 ++----------- fs/isofs/inode.c | 3 +-- fs/jffs2/fs.c | 6 ++---- fs/jfs/super.c | 3 +-- fs/libfs.c | 6 ++---- fs/logfs/super.c | 6 ++---- fs/ncpfs/inode.c | 6 ++---- fs/nfs/getroot.c | 6 ++---- fs/nilfs2/super.c | 3 +-- fs/ocfs2/dlmfs/dlmfs.c | 14 ++------------ fs/ocfs2/super.c | 3 +-- fs/omfs/inode.c | 6 ++---- fs/openpromfs/inode.c | 3 +-- fs/proc/inode.c | 15 +++------------ fs/pstore/inode.c | 3 +-- fs/qnx4/inode.c | 6 ++---- fs/ramfs/inode.c | 12 ++---------- fs/reiserfs/super.c | 6 ++---- fs/romfs/super.c | 6 ++---- fs/squashfs/super.c | 3 +-- fs/sysfs/mount.c | 3 +-- fs/sysv/super.c | 3 +-- fs/ubifs/super.c | 6 ++---- fs/udf/super.c | 3 +-- fs/ufs/super.c | 6 ++---- fs/xfs/xfs_super.c | 6 ++---- ipc/mqueue.c | 24 +++++++----------------- kernel/cgroup.c | 8 ++------ mm/shmem.c | 6 ++---- net/sunrpc/rpc_pipe.c | 8 ++------ 64 files changed, 105 insertions(+), 264 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index d4a094ca96f3..17b3211e3641 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -757,9 +757,9 @@ spufs_create_root(struct super_block *sb, void *data) goto out_iput; ret = -ENOMEM; - sb->s_root = d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) - goto out_iput; + goto out; return 0; out_iput: diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 8a2a887478cc..6a2cb560e968 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -293,11 +293,9 @@ static int hypfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; root_inode->i_op = &simple_dir_inode_operations; root_inode->i_fop = &simple_dir_operations; - sb->s_root = root_dentry = d_alloc_root(root_inode); - if (!root_dentry) { - iput(root_inode); + sb->s_root = root_dentry = d_make_root(root_inode); + if (!root_dentry) return -ENOMEM; - } if (MACHINE_IS_VM) rc = hypfs_vm_create_files(sb, root_dentry); else diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index 35361753b487..15f24f362208 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -129,11 +129,9 @@ static int ibmasmfs_fill_super (struct super_block *sb, void *data, int silent) root->i_op = &simple_dir_inode_operations; root->i_fop = ibmasmfs_dir_ops; - root_dentry = d_alloc_root(root); - if (!root_dentry) { - iput(root); + root_dentry = d_make_root(root); + if (!root_dentry) return -ENOMEM; - } sb->s_root = root_dentry; ibmasmfs_create_files(sb, root_dentry); diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index 2f0aa0f700e6..277bb70b8d75 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -251,11 +251,9 @@ static int oprofilefs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; root_inode->i_op = &simple_dir_inode_operations; root_inode->i_fop = &simple_dir_operations; - root_dentry = d_alloc_root(root_inode); - if (!root_dentry) { - iput(root_inode); + root_dentry = d_make_root(root_inode); + if (!root_dentry) return -ENOMEM; - } sb->s_root = root_dentry; diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 9e186f3da839..bdaef8e36020 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -462,16 +462,9 @@ static int usbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &usbfs_ops; sb->s_time_gran = 1; inode = usbfs_get_inode(sb, S_IFDIR | 0755, 0); - - if (!inode) { - dbg("%s: could not get inode!",__func__); - return -ENOMEM; - } - - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { dbg("%s: could not get root dentry!",__func__); - iput(inode); return -ENOMEM; } sb->s_root = root; diff --git a/drivers/usb/gadget/f_fs.c b/drivers/usb/gadget/f_fs.c index f63dc6c150d2..d825b248728a 100644 --- a/drivers/usb/gadget/f_fs.c +++ b/drivers/usb/gadget/f_fs.c @@ -1063,13 +1063,9 @@ static int ffs_sb_fill(struct super_block *sb, void *_data, int silent) &simple_dir_operations, &simple_dir_inode_operations, &data->perms); - if (unlikely(!inode)) + sb->s_root = d_make_root(inode); + if (unlikely(!sb->s_root)) goto Enomem; - sb->s_root = d_alloc_root(inode); - if (unlikely(!sb->s_root)) { - iput(inode); - goto Enomem; - } /* EP0 file */ if (unlikely(!ffs_sb_create_file(sb, "ep0", ffs, diff --git a/drivers/usb/gadget/inode.c b/drivers/usb/gadget/inode.c index ae04266dba1b..c95eea43b637 100644 --- a/drivers/usb/gadget/inode.c +++ b/drivers/usb/gadget/inode.c @@ -2059,10 +2059,8 @@ gadgetfs_fill_super (struct super_block *sb, void *opts, int silent) if (!inode) goto Enomem; inode->i_op = &simple_dir_inode_operations; - if (!(sb->s_root = d_alloc_root (inode))) { - iput(inode); + if (!(sb->s_root = d_make_root (inode))) goto Enomem; - } /* the ep0 file is named after the controller we expect; * user mode code can use it for sanity checks, like we do. diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 7b0cd87b07c2..10b7d3c9dba8 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -155,9 +155,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); retval = -ENOMEM; goto release_sb; } diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 8e3b36ace305..06fdcc9382c4 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -483,10 +483,9 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &adfs_dentry_operations; root = adfs_iget(sb, &root_obj); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { int i; - iput(root); for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); diff --git a/fs/affs/super.c b/fs/affs/super.c index 8ba73fed7964..0782653a05a2 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -473,7 +473,7 @@ got_root: root_inode = affs_iget(sb, root_block); if (IS_ERR(root_inode)) { ret = PTR_ERR(root_inode); - goto out_error_noinode; + goto out_error; } if (AFFS_SB(sb)->s_flags & SF_INTL) @@ -481,7 +481,7 @@ got_root: else sb->s_d_op = &affs_dentry_operations; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk(KERN_ERR "AFFS: Get root inode failed\n"); goto out_error; @@ -494,9 +494,6 @@ got_root: * Begin the cascaded cleanup ... */ out_error: - if (root_inode) - iput(root_inode); -out_error_noinode: kfree(sbi->s_bitmap); affs_brelse(root_bh); kfree(sbi->s_prefix); diff --git a/fs/afs/super.c b/fs/afs/super.c index 983ec59fc80d..f02b31e7e648 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -301,7 +301,6 @@ static int afs_fill_super(struct super_block *sb, { struct afs_super_info *as = sb->s_fs_info; struct afs_fid fid; - struct dentry *root = NULL; struct inode *inode = NULL; int ret; @@ -327,18 +326,16 @@ static int afs_fill_super(struct super_block *sb, set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); ret = -ENOMEM; - root = d_alloc_root(inode); - if (!root) + sb->s_root = d_make_root(inode); + if (!sb->s_root) goto error; sb->s_d_op = &afs_fs_dentry_operations; - sb->s_root = root; _leave(" = 0"); return 0; error: - iput(inode); _leave(" = %d", ret); return ret; } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 06858d955120..d8dc002e9cc3 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -247,12 +247,9 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (!ino) goto fail_free; root_inode = autofs4_get_inode(s, S_IFDIR | 0755); - if (!root_inode) - goto fail_ino; - - root = d_alloc_root(root_inode); + root = d_make_root(root_inode); if (!root) - goto fail_iput; + goto fail_ino; pipe = NULL; root->d_fsdata = ino; @@ -317,9 +314,6 @@ fail_fput: fail_dput: dput(root); goto fail_free; -fail_iput: - printk("autofs: get root dentry failed\n"); - iput(root_inode); fail_ino: kfree(ino); fail_free: diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 6e6d536767fe..e18da23d42b5 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -852,9 +852,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto unacquire_priv_sbp; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); befs_error(sb, "get root inode failed"); goto unacquire_priv_sbp; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index b0391bc402b1..e23dc7c8b884 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -367,9 +367,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) ret = PTR_ERR(inode); goto out2; } - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!s->s_root) { - iput(inode); ret = -ENOMEM; goto out2; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3ce97b217cbe..81df3fec6a6d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -629,7 +629,6 @@ static int btrfs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; - struct dentry *root_dentry; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_key key; int err; @@ -660,15 +659,12 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - root_dentry = d_alloc_root(inode); - if (!root_dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) { err = -ENOMEM; goto fail_close; } - sb->s_root = root_dentry; - save_mount_options(sb, data); cleancache_init_fs(sb); sb->s_flags |= MS_ACTIVE; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 00de2c9568cd..256f85221926 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -655,9 +655,8 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, dout("open_root_inode success\n"); if (ceph_ino(inode) == CEPH_INO_ROOT && fsc->sb->s_root == NULL) { - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); root = ERR_PTR(-ENOMEM); goto out; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8b7d7ff88792..418fc42fb8b2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -122,11 +122,9 @@ cifs_read_super(struct super_block *sb) goto out_no_root; } - sb->s_root = d_alloc_root(inode); - + sb->s_root = d_make_root(inode); if (!sb->s_root) { rc = -ENOMEM; - iput(inode); goto out_no_root; } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 32dafc875c14..05156c17b551 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -213,9 +213,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) printk("coda_read_super: rootinode is %ld dev %s\n", root->i_ino, root->i_sb->s_id); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); error = -EINVAL; goto error; } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 276e15cafd58..07f60455f1c1 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -91,10 +91,9 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n",__func__); - iput(inode); return -ENOMEM; } config_group_init(&configfs_root_group); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a2ee8f9f5a38..853480d2b3d1 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -318,11 +318,9 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) root = get_cramfs_inode(sb, &super.root, 0); if (IS_ERR(root)) goto out; - sb->s_root = d_alloc_root(root); - if (!sb->s_root) { - iput(root); + sb->s_root = d_make_root(root); + if (!sb->s_root) goto out; - } return 0; out: kfree(sbi); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c4e2a58a2e82..57dae0baedf2 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -309,12 +309,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent) inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (s->s_root) return 0; printk(KERN_ERR "devpts: get root dentry failed\n"); - iput(inode); fail: return -ENOMEM; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index b4a6befb1216..6e0e017e6932 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -550,9 +550,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags if (IS_ERR(inode)) goto out_free; - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!s->s_root) { - iput(inode); rc = -ENOMEM; goto out_free; } diff --git a/fs/efs/super.c b/fs/efs/super.c index 981106429a9f..e755ec746c69 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -317,10 +317,9 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) goto out_no_fs; } - s->s_root = d_alloc_root(root); + s->s_root = d_make_root(root); if (!(s->s_root)) { printk(KERN_ERR "EFS: get root dentry failed\n"); - iput(root); ret = -ENOMEM; goto out_no_fs; } diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 6cafcadfc3c8..7f2b590a36b7 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -819,9 +819,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(root); goto free_sbi; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); EXOFS_ERR("ERROR: get root inode failed\n"); ret = -ENOMEM; goto free_sbi; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9f6766a3ac1e..e1025c7a437a 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1088,9 +1088,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount3; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); ext2_msg(sb, KERN_ERR, "error: get root inode failed"); ret = -ENOMEM; goto failed_mount3; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 726c7ef6cdf1..e0b45b93327b 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2046,10 +2046,9 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) ext3_msg(sb, KERN_ERR, "error: corrupt root inode, run e2fsck"); goto failed_mount3; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { ext3_msg(sb, KERN_ERR, "error: get root dentry failed"); - iput(root); ret = -ENOMEM; goto failed_mount3; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 502c61fd7392..d2baea7bcf30 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3735,9 +3735,8 @@ no_journal: iput(root); goto failed_mount4; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { - iput(root); ext4_msg(sb, KERN_ERR, "get root dentry failed"); ret = -ENOMEM; goto failed_mount4; diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 9d1c99558389..d4fabd26084e 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -224,9 +224,8 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) ret = PTR_ERR(root); goto out; } - sbp->s_root = d_alloc_root(root); + sbp->s_root = d_make_root(root); if (!sbp->s_root) { - iput(root); printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); goto out_free_ilist; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 64cf8d07393e..4aec5995867e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -988,14 +988,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err = -ENOMEM; root = fuse_get_root_inode(sb, d.rootmode); - if (!root) + root_dentry = d_make_root(root); + if (!root_dentry) goto err_put_conn; - - root_dentry = d_alloc_root(root); - if (!root_dentry) { - iput(root); - goto err_put_conn; - } /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 24f609c9ef91..10e848c6d1b5 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -431,10 +431,9 @@ static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); return PTR_ERR(inode); } - dentry = d_alloc_root(inode); + dentry = d_make_root(inode); if (!dentry) { fs_err(sdp, "can't alloc %s dentry\n", name); - iput(inode); return -ENOMEM; } *dptr = dentry; diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 8137fb3e6780..7b4c537d6e13 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -430,15 +430,13 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_d_op = &hfs_dentry_operations; res = -ENOMEM; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) - goto bail_iput; + goto bail_no_root; /* everything's okay */ return 0; -bail_iput: - iput(root_inode); bail_no_root: printk(KERN_ERR "hfs: get root inode failed.\n"); bail: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e130bd46d671..588d45885a6f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -966,9 +966,9 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) } err = -ENOMEM; - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) - goto out_put; + goto out; return 0; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 3690467c944e..54f6eccb79d9 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -625,11 +625,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) hpfs_init_inode(root); hpfs_read_inode(root); unlock_new_inode(root); - s->s_root = d_alloc_root(root); - if (!s->s_root) { - iput(root); + s->s_root = d_make_root(root); + if (!s->s_root) goto bail0; - } /* * find the root directory's . pointer & finish filling in the inode diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index d92f4ce80925..a80e45a690ac 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -726,17 +726,12 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent) err = -ENOMEM; root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); - if (!root_inode) - goto out_mntput; - - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) - goto out_iput; + goto out_mntput; return 0; - out_iput: - iput(root_inode); out_mntput: mntput(proc_mnt); out: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e85a7ac0217..81932fa1861a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -831,8 +831,6 @@ bad_val: static int hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) { - struct inode * inode; - struct dentry * root; int ret; struct hugetlbfs_config config; struct hugetlbfs_sb_info *sbinfo; @@ -865,16 +863,9 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = HUGETLBFS_MAGIC; sb->s_op = &hugetlbfs_ops; sb->s_time_gran = 1; - inode = hugetlbfs_get_root(sb, &config); - if (!inode) - goto out_free; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); + sb->s_root = d_make_root(hugetlbfs_get_root(sb, &config)); + if (!sb->s_root) goto out_free; - } - sb->s_root = root; return 0; out_free: kfree(sbinfo); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index bd62c76fb5df..29037c365ba4 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -947,9 +947,8 @@ root_found: s->s_d_op = &isofs_dentry_ops[table]; /* get the root dentry */ - s->s_root = d_alloc_root(inode); + s->s_root = d_make_root(inode); if (!(s->s_root)) { - iput(inode); error = -ENOMEM; goto out_no_inode; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2e0123867cb1..c0d5c9d770da 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -561,9 +561,9 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; D1(printk(KERN_DEBUG "jffs2_do_fill_super(): d_alloc_root()\n")); - sb->s_root = d_alloc_root(root_i); + sb->s_root = d_make_root(root_i); if (!sb->s_root) - goto out_root_i; + goto out_root; sb->s_maxbytes = 0xFFFFFFFF; sb->s_blocksize = PAGE_CACHE_SIZE; @@ -573,8 +573,6 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) jffs2_start_garbage_collect_thread(c); return 0; - out_root_i: - iput(root_i); out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 4661ad705130..b3bb95504479 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -522,7 +522,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = PTR_ERR(inode); goto out_no_rw; } - sb->s_root = d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) goto out_no_root; @@ -540,7 +540,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) out_no_root: jfs_err("jfs_read_super: get root dentry failed"); - iput(inode); out_no_rw: rc = jfs_umount(sb); diff --git a/fs/libfs.c b/fs/libfs.c index 5b2dbb3ba4fc..7c895a763a1e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -491,11 +491,9 @@ int simple_fill_super(struct super_block *s, unsigned long magic, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); - root = d_alloc_root(inode); - if (!root) { - iput(inode); + root = d_make_root(inode); + if (!root) return -ENOMEM; - } for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) continue; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index b1a491a5fe78..7de18c3021fe 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -315,11 +315,9 @@ static int logfs_get_sb_final(struct super_block *sb) if (IS_ERR(rootdir)) goto fail; - sb->s_root = d_alloc_root(rootdir); - if (!sb->s_root) { - iput(rootdir); + sb->s_root = d_make_root(rootdir); + if (!sb->s_root) goto fail; - } /* at that point we know that ->put_super() will be called */ super->s_erase_page = alloc_pages(GFP_KERNEL, 0); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 3d1e34f8a68e..49df0e7f8379 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -716,13 +716,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (!root_inode) goto out_disconnect; DPRINTK("ncp_fill_super: root vol=%d\n", NCP_FINFO(root_inode)->volNumber); - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) - goto out_no_root; + goto out_disconnect; return 0; -out_no_root: - iput(root_inode); out_disconnect: ncp_lock_server(server); ncp_disconnect(server); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index dcb61548887f..801d6d830787 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -49,11 +49,9 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i { /* The mntroot acts as the dummy root dentry for this superblock */ if (sb->s_root == NULL) { - sb->s_root = d_alloc_root(inode); - if (sb->s_root == NULL) { - iput(inode); + sb->s_root = d_make_root(inode); + if (sb->s_root == NULL) return -ENOMEM; - } ihold(inode); /* * Ensure that this dentry is invisible to d_find_alias(). diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1fc9ad3c1d14..1099a76cee59 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -917,9 +917,8 @@ static int nilfs_get_root_dentry(struct super_block *sb, if (root->cno == NILFS_CPTREE_CURRENT_CNO) { dentry = d_find_alias(inode); if (!dentry) { - dentry = d_alloc_root(inode); + dentry = d_make_root(inode); if (!dentry) { - iput(inode); ret = -ENOMEM; goto failed_dentry; } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index abfac0d7ae9c..3b5825ef3193 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -582,24 +582,14 @@ static int dlmfs_fill_super(struct super_block * sb, void * data, int silent) { - struct inode * inode; - struct dentry * root; - sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = DLMFS_MAGIC; sb->s_op = &dlmfs_ops; - inode = dlmfs_get_root_inode(sb); - if (!inode) - return -ENOMEM; - - root = d_alloc_root(inode); - if (!root) { - iput(inode); + sb->s_root = d_make_root(dlmfs_get_root_inode(sb)); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = root; return 0; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2b1184f7097f..337687c3e233 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1166,9 +1166,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) goto read_super_error; } - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { - iput(inode); status = -ENOMEM; mlog_errno(status); goto read_super_error; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 6065bb0ba207..dbc842222589 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -539,11 +539,9 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) goto out_brelse_bh2; } - sb->s_root = d_alloc_root(root); - if (!sb->s_root) { - iput(root); + sb->s_root = d_make_root(root); + if (!sb->s_root) goto out_brelse_bh2; - } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index a88c03bc749d..bc49c975d501 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -408,13 +408,12 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) oi->type = op_inode_node; oi->u.node = of_find_node_by_path("/"); - s->s_root = d_alloc_root(root_inode); + s->s_root = d_make_root(root_inode); if (!s->s_root) goto out_no_root_dentry; return 0; out_no_root_dentry: - iput(root_inode); ret = -ENOMEM; out_no_root: printk("openprom_fill_super: get root inode failed\n"); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a70af3a44f45..8461a7b82fdb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -486,8 +486,6 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) int proc_fill_super(struct super_block *s) { - struct inode * root_inode; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; @@ -496,17 +494,10 @@ int proc_fill_super(struct super_block *s) s->s_time_gran = 1; pde_get(&proc_root); - root_inode = proc_get_inode(s, &proc_root); - if (!root_inode) - goto out_no_root; - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) { - iput(root_inode); - goto out_no_root; - } - return 0; + s->s_root = d_make_root(proc_get_inode(s, &proc_root)); + if (s->s_root) + return 0; -out_no_root: printk("proc_read_super: get root inode failed\n"); pde_put(&proc_root); return -ENOMEM; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index b3b426edb2fd..ec7d1fb6f35a 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -303,7 +303,7 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) /* override ramfs "dir" options so we catch unlink(2) */ inode->i_op = &pstore_dir_inode_operations; - root = d_alloc_root(inode); + root = d_make_root(inode); sb->s_root = root; if (!root) { err = -ENOMEM; @@ -314,7 +314,6 @@ int pstore_fill_super(struct super_block *sb, void *data, int silent) return 0; fail: - iput(inode); return err; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 6b009548d2e0..db18d866d981 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -260,15 +260,13 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) } ret = -ENOMEM; - s->s_root = d_alloc_root(root); + s->s_root = d_make_root(root); if (s->s_root == NULL) - goto outi; + goto outb; brelse(bh); return 0; - outi: - iput(root); outb: kfree(qs->BitMap); out: diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index aec766abe3af..b6612d2ed718 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -210,7 +210,6 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) { struct ramfs_fs_info *fsi; struct inode *inode = NULL; - struct dentry *root; int err; save_mount_options(sb, data); @@ -234,14 +233,8 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; inode = ramfs_get_inode(sb, NULL, S_IFDIR | fsi->mount_opts.mode, 0); - if (!inode) { - err = -ENOMEM; - goto fail; - } - - root = d_alloc_root(inode); - sb->s_root = root; - if (!root) { + sb->s_root = d_make_root(inode); + if (!sb->s_root) { err = -ENOMEM; goto fail; } @@ -250,7 +243,6 @@ int ramfs_fill_super(struct super_block *sb, void *data, int silent) fail: kfree(fsi); sb->s_fs_info = NULL; - iput(inode); return err; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index e12d8b97cd4d..208dfd144409 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1874,11 +1874,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) unlock_new_inode(root_inode); } - s->s_root = d_alloc_root(root_inode); - if (!s->s_root) { - iput(root_inode); + s->s_root = d_make_root(root_inode); + if (!s->s_root) goto error; - } // define and initialize hash function sbi->s_hash_function = hash_function(s); if (sbi->s_hash_function == NULL) { diff --git a/fs/romfs/super.c b/fs/romfs/super.c index bb36ab74eb45..e64f6b5f7ae5 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -538,14 +538,12 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(root)) goto error; - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) - goto error_i; + goto error; return 0; -error_i: - iput(root); error: return -EINVAL; error_rsb_inval: diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index ecaa2f7bdb8f..970b1167e7cb 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -316,11 +316,10 @@ check_directory_table: } insert_inode_hash(root); - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (sb->s_root == NULL) { ERROR("Root inode create failed\n"); err = -ENOMEM; - iput(root); goto failed_mount; } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index e34f0d99ea4e..2243f8ec64d5 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -61,10 +61,9 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) } /* instantiate and link root dentry */ - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) { pr_debug("%s: could not get root dentry!\n",__func__); - iput(inode); return -ENOMEM; } root->d_fsdata = &sysfs_root; diff --git a/fs/sysv/super.c b/fs/sysv/super.c index f467740e088c..7491c33b6468 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -341,9 +341,8 @@ static int complete_read_super(struct super_block *sb, int silent, int size) printk("SysV FS: get root inode failed\n"); return 0; } - sb->s_root = d_alloc_root(root_inode); + sb->s_root = d_make_root(root_inode); if (!sb->s_root) { - iput(root_inode); printk("SysV FS: get root dentry failed\n"); return 0; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 63765d58445b..76e4e0566ad6 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2076,15 +2076,13 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) goto out_umount; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) - goto out_iput; + goto out_umount; mutex_unlock(&c->umount_mutex); return 0; -out_iput: - iput(root); out_umount: ubifs_umount(c); out_unlock: diff --git a/fs/udf/super.c b/fs/udf/super.c index 8d8b25336fbb..85067b4c7e14 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -2037,10 +2037,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) } /* Allocate a dentry for the root inode */ - sb->s_root = d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) { udf_err(sb, "Couldn't allocate root dentry\n"); - iput(inode); goto error_out; } sb->s_maxbytes = MAX_LFS_FILESIZE; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index ec25d09fcaa8..f636f6b460d0 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1164,10 +1164,10 @@ magic_found: ret = PTR_ERR(inode); goto failed; } - sb->s_root = d_alloc_root(inode); + sb->s_root = d_make_root(inode); if (!sb->s_root) { ret = -ENOMEM; - goto dalloc_failed; + goto failed; } ufs_setup_cstotal(sb); @@ -1181,8 +1181,6 @@ magic_found: UFSD("EXIT\n"); return 0; -dalloc_failed: - iput(inode); failed: if (ubh) ubh_brelse_uspi (uspi); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0e4c5c017fba..baf40e378d35 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1362,10 +1362,10 @@ xfs_fs_fill_super( error = EINVAL; goto out_syncd_stop; } - sb->s_root = d_alloc_root(root); + sb->s_root = d_make_root(root); if (!sb->s_root) { error = ENOMEM; - goto out_iput; + goto out_syncd_stop; } return 0; @@ -1384,8 +1384,6 @@ xfs_fs_fill_super( out: return -error; - out_iput: - iput(root); out_syncd_stop: xfs_syncd_stop(mp); out_unmount: diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 86ee272de210..28bd64ddeda3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -188,30 +188,20 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct ipc_namespace *ns = data; - int error; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = MQUEUE_MAGIC; sb->s_op = &mqueue_super_ops; - inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, - NULL); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - goto out; - } + inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL); + if (IS_ERR(inode)) + return PTR_ERR(inode); - sb->s_root = d_alloc_root(inode); - if (!sb->s_root) { - iput(inode); - error = -ENOMEM; - goto out; - } - error = 0; - -out: - return error; + sb->s_root = d_make_root(inode); + if (!sb->s_root) + return -ENOMEM; + return 0; } static struct dentry *mqueue_mount(struct file_system_type *fs_type, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a5d3b5325f77..711c1a30ceaa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1472,7 +1472,6 @@ static int cgroup_get_rootdir(struct super_block *sb) struct inode *inode = cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); - struct dentry *dentry; if (!inode) return -ENOMEM; @@ -1481,12 +1480,9 @@ static int cgroup_get_rootdir(struct super_block *sb) inode->i_op = &cgroup_dir_inode_operations; /* directories start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - dentry = d_alloc_root(inode); - if (!dentry) { - iput(inode); + sb->s_root = d_make_root(inode); + if (!sb->s_root) return -ENOMEM; - } - sb->s_root = dentry; /* for everything else we want ->d_op set */ sb->s_d_op = &cgroup_dops; return 0; diff --git a/mm/shmem.c b/mm/shmem.c index 269d049294ab..154243f0a27c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2232,14 +2232,12 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) goto failed; inode->i_uid = sbinfo->uid; inode->i_gid = sbinfo->gid; - root = d_alloc_root(inode); + root = d_make_root(inode); if (!root) - goto failed_iput; + goto failed; sb->s_root = root; return 0; -failed_iput: - iput(inode); failed: shmem_put_super(sb); return err; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 63a7a7add21e..7d6dd6efbdbe 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1033,13 +1033,9 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; inode = rpc_get_inode(sb, S_IFDIR | 0755); - if (!inode) - return -ENOMEM; - sb->s_root = root = d_alloc_root(inode); - if (!root) { - iput(inode); + sb->s_root = root = d_make_root(inode); + if (!root) return -ENOMEM; - } if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; return 0; -- cgit v1.2.3 From 66b3fad3f4c535c92b6a1184d535a97d6aa5d82a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Mar 2012 21:48:20 -0400 Subject: constify path argument of audit_log_d_path() Signed-off-by: Al Viro --- include/linux/audit.h | 2 +- kernel/audit.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index 9ff7a2c48b50..ed3ef1972496 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -684,7 +684,7 @@ extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path); + const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_lost(const char *message); diff --git a/kernel/audit.c b/kernel/audit.c index bb0eb5bb9a0a..1c7f2c61416b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1418,7 +1418,7 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) /* This is a helper-function to print the escaped d_path */ void audit_log_d_path(struct audit_buffer *ab, const char *prefix, - struct path *path) + const struct path *path) { char *p, *pathname; -- cgit v1.2.3 From 38eff2892628fa5c4fc8962a17b7296f42833ebe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Mar 2012 21:51:10 -0400 Subject: constify path argument of trace_seq_path() Signed-off-by: Al Viro --- include/linux/trace_seq.h | 4 ++-- kernel/trace/trace_output.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 7dadc3df0c77..a32d86ec8bf2 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -44,7 +44,7 @@ extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len); extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); +extern int trace_seq_path(struct trace_seq *s, const struct path *path); #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) @@ -88,7 +88,7 @@ static inline void *trace_seq_reserve(struct trace_seq *s, size_t len) { return NULL; } -static inline int trace_seq_path(struct trace_seq *s, struct path *path) +static inline int trace_seq_path(struct trace_seq *s, const struct path *path) { return 0; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0d6ff3555942..690987198ad7 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -264,7 +264,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len) return ret; } -int trace_seq_path(struct trace_seq *s, struct path *path) +int trace_seq_path(struct trace_seq *s, const struct path *path) { unsigned char *p; -- cgit v1.2.3 From c3f0327f8e9d7a503f0d64573c311eddd61f197d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 21 Mar 2012 16:33:48 -0700 Subject: mm: add rss counters consistency check Warn about non-zero rss counters at final mmdrop. This check will prevent reoccurences of bugs such as that fixed in "mm: fix rss count leakage during migration". I didn't hide this check under CONFIG_VM_DEBUG because it rather small and rss counters cover whole page-table management, so this is a good invariant. Signed-off-by: Konstantin Khlebnikov Cc: Hugh Dickins Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c4f38a849436..a9e99f3c18e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -511,6 +511,23 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) return NULL; } +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(mm->pmd_huge_pte); +#endif +} + /* * Allocate and initialize an mm_struct. */ @@ -538,9 +555,7 @@ void __mmdrop(struct mm_struct *mm) mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON(mm->pmd_huge_pte); -#endif + check_mm(mm); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); -- cgit v1.2.3 From cc9a6c8776615f9c194ccf0b63a0aa5628235545 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 21 Mar 2012 16:34:11 -0700 Subject: cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman Cc: Miao Xie Cc: David Rientjes Cc: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 47 ++++++++++++++++++++--------------------------- include/linux/init_task.h | 8 ++++++++ include/linux/sched.h | 2 +- kernel/cpuset.c | 43 ++++++++----------------------------------- kernel/fork.c | 1 + mm/filemap.c | 11 +++++++---- mm/hugetlb.c | 15 +++++++++++---- mm/mempolicy.c | 28 +++++++++++++++++++++------- mm/page_alloc.c | 33 +++++++++++++++++++++++---------- mm/slab.c | 13 ++++++++----- mm/slub.c | 40 +++++++++++++++++++++++++--------------- mm/vmscan.c | 2 -- 12 files changed, 133 insertions(+), 110 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e9eaec522655..7a7e5fd2a277 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -89,42 +89,33 @@ extern void rebuild_sched_domains(void); extern void cpuset_print_task_mems_allowed(struct task_struct *p); /* - * reading current mems_allowed and mempolicy in the fastpath must protected - * by get_mems_allowed() + * get_mems_allowed is required when making decisions involving mems_allowed + * such as during page allocation. mems_allowed can be updated in parallel + * and depending on the new value an operation can fail potentially causing + * process failure. A retry loop with get_mems_allowed and put_mems_allowed + * prevents these artificial failures. */ -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { - current->mems_allowed_change_disable++; - - /* - * ensure that reading mems_allowed and mempolicy happens after the - * update of ->mems_allowed_change_disable. - * - * the write-side task finds ->mems_allowed_change_disable is not 0, - * and knows the read-side task is reading mems_allowed or mempolicy, - * so it will clear old bits lazily. - */ - smp_mb(); + return read_seqcount_begin(¤t->mems_allowed_seq); } -static inline void put_mems_allowed(void) +/* + * If this returns false, the operation that took place after get_mems_allowed + * may have failed. It is up to the caller to retry the operation if + * appropriate. + */ +static inline bool put_mems_allowed(unsigned int seq) { - /* - * ensure that reading mems_allowed and mempolicy before reducing - * mems_allowed_change_disable. - * - * the write-side task will know that the read-side task is still - * reading mems_allowed or mempolicy, don't clears old bits in the - * nodemask. - */ - smp_mb(); - --ACCESS_ONCE(current->mems_allowed_change_disable); + return !read_seqcount_retry(¤t->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { task_lock(current); + write_seqcount_begin(¤t->mems_allowed_seq); current->mems_allowed = nodemask; + write_seqcount_end(¤t->mems_allowed_seq); task_unlock(current); } @@ -234,12 +225,14 @@ static inline void set_mems_allowed(nodemask_t nodemask) { } -static inline void get_mems_allowed(void) +static inline unsigned int get_mems_allowed(void) { + return 0; } -static inline void put_mems_allowed(void) +static inline bool put_mems_allowed(unsigned int seq) { + return true; } #endif /* !CONFIG_CPUSETS */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f994d51f70f2..e4baff5f7ff4 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -29,6 +29,13 @@ extern struct fs_struct init_fs; #define INIT_GROUP_RWSEM(sig) #endif +#ifdef CONFIG_CPUSETS +#define INIT_CPUSET_SEQ \ + .mems_allowed_seq = SEQCNT_ZERO, +#else +#define INIT_CPUSET_SEQ +#endif + #define INIT_SIGNALS(sig) { \ .nr_threads = 1, \ .wait_chldexit = __WAIT_QUEUE_HEAD_INITIALIZER(sig.wait_chldexit),\ @@ -192,6 +199,7 @@ extern struct cred init_cred; INIT_FTRACE_GRAPH \ INIT_TRACE_RECURSION \ INIT_TASK_RCU_PREEMPT(tsk) \ + INIT_CPUSET_SEQ \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index e074e1e54f85..0c147a4260a5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1514,7 +1514,7 @@ struct task_struct { #endif #ifdef CONFIG_CPUSETS nodemask_t mems_allowed; /* Protected by alloc_lock */ - int mems_allowed_change_disable; + seqcount_t mems_allowed_seq; /* Seqence no to catch updates */ int cpuset_mem_spread_rotor; int cpuset_slab_spread_rotor; #endif diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5d575836dba6..1010cc61931f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -964,7 +964,6 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, { bool need_loop; -repeat: /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. @@ -983,45 +982,19 @@ repeat: */ need_loop = task_has_mempolicy(tsk) || !nodes_intersects(*newmems, tsk->mems_allowed); - nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); - mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); - /* - * ensure checking ->mems_allowed_change_disable after setting all new - * allowed nodes. - * - * the read-side task can see an nodemask with new allowed nodes and - * old allowed nodes. and if it allocates page when cpuset clears newly - * disallowed ones continuous, it can see the new allowed bits. - * - * And if setting all new allowed nodes is after the checking, setting - * all new allowed nodes and clearing newly disallowed ones will be done - * continuous, and the read-side task may find no node to alloc page. - */ - smp_mb(); + if (need_loop) + write_seqcount_begin(&tsk->mems_allowed_seq); - /* - * Allocation of memory is very fast, we needn't sleep when waiting - * for the read-side. - */ - while (need_loop && ACCESS_ONCE(tsk->mems_allowed_change_disable)) { - task_unlock(tsk); - if (!task_curr(tsk)) - yield(); - goto repeat; - } - - /* - * ensure checking ->mems_allowed_change_disable before clearing all new - * disallowed nodes. - * - * if clearing newly disallowed bits before the checking, the read-side - * task may find no node to alloc page. - */ - smp_mb(); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); tsk->mems_allowed = *newmems; + + if (need_loop) + write_seqcount_end(&tsk->mems_allowed_seq); + task_unlock(tsk); } diff --git a/kernel/fork.c b/kernel/fork.c index a9e99f3c18e0..9cc227d54102 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1237,6 +1237,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; p->cpuset_slab_spread_rotor = NUMA_NO_NODE; + seqcount_init(&p->mems_allowed_seq); #endif #ifdef CONFIG_TRACE_IRQFLAGS p->irq_events = 0; diff --git a/mm/filemap.c b/mm/filemap.c index f3230604006c..843042045dc9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -499,10 +499,13 @@ struct page *__page_cache_alloc(gfp_t gfp) struct page *page; if (cpuset_do_page_mem_spread()) { - get_mems_allowed(); - n = cpuset_mem_spread_node(); - page = alloc_pages_exact_node(n, gfp, 0); - put_mems_allowed(); + unsigned int cpuset_mems_cookie; + do { + cpuset_mems_cookie = get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + } while (!put_mems_allowed(cpuset_mems_cookie) && !page); + return page; } return alloc_pages(gfp, 0); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 62f9fada4d6d..b1c314877334 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -454,14 +454,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - struct page *page = NULL; + struct page *page; struct mempolicy *mpol; nodemask_t *nodemask; struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + unsigned int cpuset_mems_cookie; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, htlb_alloc_mask, &mpol, &nodemask); /* @@ -488,10 +490,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, } } } -err: + mpol_cond_put(mpol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; + +err: + mpol_cond_put(mpol); + return NULL; } static void update_and_free_page(struct hstate *h, struct page *page) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 71e1a523e209..cfb6c8678754 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1850,18 +1850,24 @@ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { - struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct mempolicy *pol; struct zonelist *zl; struct page *page; + unsigned int cpuset_mems_cookie; + +retry_cpuset: + pol = get_vma_policy(current, vma, addr); + cpuset_mems_cookie = get_mems_allowed(); - get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } zl = policy_zonelist(gfp, pol, node); @@ -1872,7 +1878,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } /* @@ -1880,7 +1887,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, */ page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); - put_mems_allowed(); + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; return page; } @@ -1907,11 +1915,14 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; struct page *page; + unsigned int cpuset_mems_cookie; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* * No reference counting needed for current->mempolicy * nor system default_policy @@ -1922,7 +1933,10 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(alloc_pages_current); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 673596ad9c80..40de6854b980 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2380,8 +2380,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; - struct page *page; + struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); + unsigned int cpuset_mems_cookie; gfp_mask &= gfp_allowed_mask; @@ -2400,15 +2401,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - get_mems_allowed(); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); - if (!preferred_zone) { - put_mems_allowed(); - return NULL; - } + if (!preferred_zone) + goto out; /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -2418,9 +2419,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - put_mems_allowed(); trace_mm_page_alloc(page, order, gfp_mask, migratetype); + +out: + /* + * When updating a task's mems_allowed, it is possible to race with + * parallel threads in such a way that an allocation can fail while + * the mask is being updated. If a page allocation is about to fail, + * check if the cpuset changed during allocation and if so, retry. + */ + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) + goto retry_cpuset; + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2634,13 +2645,15 @@ void si_meminfo_node(struct sysinfo *val, int nid) bool skip_free_areas_node(unsigned int flags, int nid) { bool ret = false; + unsigned int cpuset_mems_cookie; if (!(flags & SHOW_MEM_FILTER_NODES)) goto out; - get_mems_allowed(); - ret = !node_isset(nid, cpuset_current_mems_allowed); - put_mems_allowed(); + do { + cpuset_mems_cookie = get_mems_allowed(); + ret = !node_isset(nid, cpuset_current_mems_allowed); + } while (!put_mems_allowed(cpuset_mems_cookie)); out: return ret; } diff --git a/mm/slab.c b/mm/slab.c index f0bd7857ab3b..29c8716eb7a9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3284,12 +3284,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; nid_alloc = nid_here = numa_mem_id(); - get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); - put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3312,14 +3310,17 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; int nid; + unsigned int cpuset_mems_cookie; if (flags & __GFP_THISNODE) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + retry: /* * Look through allowed nodes for objects available @@ -3372,7 +3373,9 @@ retry: } } } - put_mems_allowed(); + + if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !obj)) + goto retry_cpuset; return obj; } diff --git a/mm/slub.c b/mm/slub.c index 4907563ef7ff..f4a6229848fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1581,6 +1581,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *object; + unsigned int cpuset_mems_cookie; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1604,23 +1605,32 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; - get_mems_allowed(); - zonelist = node_zonelist(slab_node(current->mempolicy), flags); - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - struct kmem_cache_node *n; - - n = get_node(s, zone_to_nid(zone)); - - if (n && cpuset_zone_allowed_hardwall(zone, flags) && - n->nr_partial > s->min_partial) { - object = get_partial_node(s, n, c); - if (object) { - put_mems_allowed(); - return object; + do { + cpuset_mems_cookie = get_mems_allowed(); + zonelist = node_zonelist(slab_node(current->mempolicy), flags); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + struct kmem_cache_node *n; + + n = get_node(s, zone_to_nid(zone)); + + if (n && cpuset_zone_allowed_hardwall(zone, flags) && + n->nr_partial > s->min_partial) { + object = get_partial_node(s, n, c); + if (object) { + /* + * Return the object even if + * put_mems_allowed indicated that + * the cpuset mems_allowed was + * updated in parallel. It's a + * harmless race between the alloc + * and the cpuset update. + */ + put_mems_allowed(cpuset_mems_cookie); + return object; + } } } - } - put_mems_allowed(); + } while (!put_mems_allowed(cpuset_mems_cookie)); #endif return NULL; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 440af1d899b9..55d86c9506f3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2343,7 +2343,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, unsigned long writeback_threshold; bool aborted_reclaim; - get_mems_allowed(); delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2407,7 +2406,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, out: delayacct_freepages_end(); - put_mems_allowed(); if (sc->nr_reclaimed) return sc->nr_reclaimed; -- cgit v1.2.3 From 05af2e104a0c282dcd9303431e1360750ba76de6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 21 Mar 2012 16:34:13 -0700 Subject: mm, counters: remove task argument to sync_mm_rss() and __sync_task_rss_stat() sync_mm_rss() can only be used for current to avoid race conditions in iterating and clearing its per-task counters. Remove the task argument for it and its helper function, __sync_task_rss_stat(), to avoid thinking it can be used safely for anything other than current. Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/linux/mm.h | 4 ++-- kernel/exit.c | 2 +- mm/memory.c | 18 +++++++++--------- mm/mmu_context.c | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 3908544f5d18..6ed164d20d7d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -824,7 +824,7 @@ static int exec_mmap(struct mm_struct *mm) /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; - sync_mm_rss(tsk, old_mm); + sync_mm_rss(old_mm); mm_release(tsk, old_mm); if (old_mm) { diff --git a/include/linux/mm.h b/include/linux/mm.h index df17ff23d50e..ce2b2a3b2876 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,9 +1131,9 @@ static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, } #if defined(SPLIT_RSS_COUNTING) -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm); +void sync_mm_rss(struct mm_struct *mm); #else -static inline void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +static inline void sync_mm_rss(struct mm_struct *mm) { } #endif diff --git a/kernel/exit.c b/kernel/exit.c index 0ed15fed579f..d26acd3c1e2e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -934,7 +934,7 @@ void do_exit(long code) acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) - sync_mm_rss(tsk, tsk->mm); + sync_mm_rss(tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); diff --git a/mm/memory.c b/mm/memory.c index a5de734e14a7..2d27239ce4dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -125,17 +125,17 @@ core_initcall(init_zero_pfn); #if defined(SPLIT_RSS_COUNTING) -static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +static void __sync_task_rss_stat(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { - if (task->rss_stat.count[i]) { - add_mm_counter(mm, i, task->rss_stat.count[i]); - task->rss_stat.count[i] = 0; + if (current->rss_stat.count[i]) { + add_mm_counter(mm, i, current->rss_stat.count[i]); + current->rss_stat.count[i] = 0; } } - task->rss_stat.events = 0; + current->rss_stat.events = 0; } static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) @@ -157,12 +157,12 @@ static void check_sync_rss_stat(struct task_struct *task) if (unlikely(task != current)) return; if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - __sync_task_rss_stat(task, task->mm); + __sync_task_rss_stat(task->mm); } -void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +void sync_mm_rss(struct mm_struct *mm) { - __sync_task_rss_stat(task, mm); + __sync_task_rss_stat(mm); } #else /* SPLIT_RSS_COUNTING */ @@ -643,7 +643,7 @@ static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) int i; if (current->mm == mm) - sync_mm_rss(current, mm); + sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); diff --git a/mm/mmu_context.c b/mm/mmu_context.c index cf332bc0080a..3dcfaf4ed355 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -53,7 +53,7 @@ void unuse_mm(struct mm_struct *mm) struct task_struct *tsk = current; task_lock(tsk); - sync_mm_rss(tsk, mm); + sync_mm_rss(mm); tsk->mm = NULL; /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); -- cgit v1.2.3 From 42aee6c495e07dba7410b863a360db6bb9ec6d66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:21 -0700 Subject: cgroup: revert ss_id_lock to spinlock Commit c1e2ee2dc436 ("memcg: replace ss->id_lock with a rwlock") has now been seen to cause the unfair behavior we should have expected from converting a spinlock to an rwlock: softlockup in cgroup_mkdir(), whose get_new_cssid() is waiting for the wlock, while there are 19 tasks using the rlock in css_get_next() to get on with their memcg workload (in an artificial test, admittedly). Yet lib/idr.c was made suitable for RCU way back: revert that commit, restoring ss->id_lock to a spinlock. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Eric Dumazet Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 501adb1b2f43..5a85b3415c1b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -498,7 +498,7 @@ struct cgroup_subsys { struct list_head sibling; /* used when use_id == true */ struct idr idr; - rwlock_t id_lock; + spinlock_t id_lock; /* should be defined only by modular subsystems */ struct module *module; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c6877fe9a831..8eb90f25bd7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4885,9 +4885,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) rcu_assign_pointer(id->css, NULL); rcu_assign_pointer(css->id, NULL); - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, id->id); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); kfree_rcu(id, rcu_head); } EXPORT_SYMBOL_GPL(free_css_id); @@ -4913,10 +4913,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) error = -ENOMEM; goto err_out; } - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); /* Don't use 0. allocates an ID of 1-65535 */ error = idr_get_new_above(&ss->idr, newid, 1, &myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); /* Returns error when there are no free spaces for new ID.*/ if (error) { @@ -4931,9 +4931,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) return newid; remove_idr: error = -ENOSPC; - write_lock(&ss->id_lock); + spin_lock(&ss->id_lock); idr_remove(&ss->idr, myid); - write_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); err_out: kfree(newid); return ERR_PTR(error); @@ -4945,7 +4945,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, { struct css_id *newid; - rwlock_init(&ss->id_lock); + spin_lock_init(&ss->id_lock); idr_init(&ss->idr); newid = get_new_cssid(ss, 0); @@ -5040,9 +5040,9 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - read_lock(&ss->id_lock); + spin_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - read_unlock(&ss->id_lock); + spin_unlock(&ss->id_lock); if (!tmp) break; -- cgit v1.2.3 From ca464d69b19120a826aa2534de2511a6f542edf5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 21 Mar 2012 16:34:21 -0700 Subject: memcg: let css_get_next() rely upon rcu_read_lock() Remove lock and unlock around css_get_next()'s call to idr_get_next(). memcg iterators (only users of css_get_next) already did rcu_read_lock(), and its comment demands that; but add a WARN_ON_ONCE to make sure of it. Signed-off-by: Hugh Dickins Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Eric Dumazet Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8eb90f25bd7b..391d5e991e5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5033,6 +5033,8 @@ css_get_next(struct cgroup_subsys *ss, int id, return NULL; BUG_ON(!ss->use_id); + WARN_ON_ONCE(!rcu_read_lock_held()); + /* fill start point for scan */ tmpid = id; while (1) { @@ -5040,10 +5042,7 @@ css_get_next(struct cgroup_subsys *ss, int id, * scan next entry from bitmap(tree), tmpid is updated after * idr_get_next(). */ - spin_lock(&ss->id_lock); tmp = idr_get_next(&ss->idr, &tmpid); - spin_unlock(&ss->id_lock); - if (!tmp) break; if (tmp->depth >= depth && tmp->stack[depth] == rootid) { -- cgit v1.2.3 From 01de982abf8c9e10fc3089e10585cd2cc914bdab Mon Sep 17 00:00:00 2001 From: Wolfgang Mauerer Date: Thu, 22 Mar 2012 11:18:20 +0100 Subject: tracing: Fix ftrace stack trace entries 8 hex characters tell only half the tale for 64 bit CPUs, so use the appropriate length. Link: http://lkml.kernel.org/r/1332411501-8059-2-git-send-email-wolfgang.mauerer@siemens.com Cc: stable@vger.kernel.org Signed-off-by: Wolfgang Mauerer Signed-off-by: Steven Rostedt --- kernel/trace/trace_entries.h | 16 ++++++++++++---- kernel/trace/trace_export.c | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 93365907f219..205dcac89206 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -156,6 +156,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, #define FTRACE_STACK_ENTRIES 8 +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + FTRACE_ENTRY(kernel_stack, stack_entry, TRACE_STACK, @@ -165,8 +171,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry, __dynamic_array(unsigned long, caller ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], __entry->caller[6], __entry->caller[7]) @@ -181,8 +188,9 @@ FTRACE_ENTRY(user_stack, userstack_entry, __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) ), - F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" - "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", __entry->caller[0], __entry->caller[1], __entry->caller[2], __entry->caller[3], __entry->caller[4], __entry->caller[5], __entry->caller[6], __entry->caller[7]) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index bbeec31e0ae3..ad4000c71be0 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -150,7 +150,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define __dynamic_array(type, item) #undef F_printk -#define F_printk(fmt, args...) #fmt ", " __stringify(args) +#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ -- cgit v1.2.3 From 9fbe465efc76044dd87afe764db5464ae61aeabc Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 16 Mar 2012 13:17:13 +0100 Subject: kgdb: Respect that flush op is optional Not all kgdb I/O drivers implement a flush operation. Adjust gdbstub_exit accordingly. Signed-off-by: Jan Kiszka Signed-off-by: Jason Wessel --- kernel/debug/gdbstub.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index c22d8c28ad84..5a155742ae96 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1129,5 +1129,6 @@ void gdbstub_exit(int status) dbg_io_ops->write_char(hex_asc_lo(checksum)); /* make sure the output is flushed, lest the bootloader clobber it */ - dbg_io_ops->flush(); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); } -- cgit v1.2.3 From 2366e047840e33928803c0442176fb3991423da8 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 16 Mar 2012 14:20:41 -0500 Subject: kgdb,debug-core,gdbstub: Hook the reboot notifier for debugger detach The gdbstub and kdb should get detached if the system is rebooting. Calling gdbstub_exit() will set the proper debug core state and send a message to any debugger that is connected to correctly detach. An attached debugger will receive the exit code from include/linux/reboot.h based on SYS_HALT, SYS_REBOOT, etc... Reported-by: Jan Kiszka Signed-off-by: Jason Wessel --- kernel/debug/debug_core.c | 17 +++++++++++++++++ kernel/debug/gdbstub.c | 7 +++++++ 2 files changed, 24 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0d7c08784efb..3c1ad4e03543 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -784,6 +785,20 @@ void __init dbg_late_init(void) kdb_init(KDB_INIT_FULL); } +static int +dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) +{ + if (!dbg_kdb_mode) + gdbstub_exit(code); + return NOTIFY_DONE; +} + +static struct notifier_block dbg_reboot_notifier = { + .notifier_call = dbg_notify_reboot, + .next = NULL, + .priority = INT_MAX, +}; + static void kgdb_register_callbacks(void) { if (!kgdb_io_module_registered) { @@ -791,6 +806,7 @@ static void kgdb_register_callbacks(void) kgdb_arch_init(); if (!dbg_is_early) kgdb_arch_late(); + register_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_register(&panic_notifier_list, &kgdb_panic_event_nb); #ifdef CONFIG_MAGIC_SYSRQ @@ -812,6 +828,7 @@ static void kgdb_unregister_callbacks(void) */ if (kgdb_io_module_registered) { kgdb_io_module_registered = 0; + unregister_reboot_notifier(&dbg_reboot_notifier); atomic_notifier_chain_unregister(&panic_notifier_list, &kgdb_panic_event_nb); kgdb_arch_exit(); diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 5a155742ae96..ce615e064482 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -1111,6 +1111,13 @@ void gdbstub_exit(int status) unsigned char checksum, ch, buffer[3]; int loop; + if (!kgdb_connected) + return; + kgdb_connected = 0; + + if (!dbg_io_ops || dbg_kdb_mode) + return; + buffer[0] = 'W'; buffer[1] = hex_asc_hi(status); buffer[2] = hex_asc_lo(status); -- cgit v1.2.3 From 8f30d411767351656ea62c9e7612120f9b870b59 Mon Sep 17 00:00:00 2001 From: Andrei Warkentin Date: Tue, 28 Feb 2012 06:55:05 -0600 Subject: KDB: Fix usability issues relating to the 'enter' key. This fixes the following problems: 1) Typematic-repeat of 'enter' gives warning message and leaks make/break if KDB exits. Repeats look something like 0x1c 0x1c .... 0x9c 2) Use of 'keypad enter' gives warning message and leaks the ENTER break/make code out if KDB exits. KP ENTER repeats look someting like 0xe0 0x1c 0xe0 0x1c ... 0xe0 0x9c. 3) Lag on the order of seconds between "break" and "make" when expecting the enter "break" code. Seen under virtualized environments such as VMware ESX. The existing special enter handler tries to glob the enter break code, but this fails if the other (KP) enter was used, or if there was a key repeat. It also fails if you mashed some keys along with enter, and you ended up with a non-enter make or non-enter break code coming after the enter make code. So first, we modify the handler to handle these cases. But performing these actions on every enter is annoying since now you can't hold ENTER down to scroll d messages in KDB. Since this special behaviour is only necessary to handle the exiting KDB ('g' + ENTER) without leaking scancodes to the OS. This cleanup needs to get executed anytime the kdb_main loop exits. Tested on QEMU. Set a bp on atkbd.c to verify no scan code was leaked. Cc: Andrei Warkentin [jason.wessel@windriver.com: move cleanup calls to kdb_main.c] Signed-off-by: Andrei Warkentin Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_keyboard.c | 95 +++++++++++++++++++++++++++++++---------- kernel/debug/kdb/kdb_main.c | 3 ++ kernel/debug/kdb/kdb_private.h | 7 +++ 3 files changed, 83 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c index 4bca634975c0..118527aa60ea 100644 --- a/kernel/debug/kdb/kdb_keyboard.c +++ b/kernel/debug/kdb/kdb_keyboard.c @@ -25,6 +25,7 @@ #define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */ static int kbd_exists; +static int kbd_last_ret; /* * Check if the keyboard controller has a keypress for us. @@ -90,8 +91,11 @@ int kdb_get_kbd_char(void) return -1; } - if ((scancode & 0x80) != 0) + if ((scancode & 0x80) != 0) { + if (scancode == 0x9c) + kbd_last_ret = 0; return -1; + } scancode &= 0x7f; @@ -178,35 +182,82 @@ int kdb_get_kbd_char(void) return -1; /* ignore unprintables */ } - if ((scancode & 0x7f) == 0x1c) { - /* - * enter key. All done. Absorb the release scancode. - */ + if (scancode == 0x1c) { + kbd_last_ret = 1; + return 13; + } + + return keychar & 0xff; +} +EXPORT_SYMBOL_GPL(kdb_get_kbd_char); + +/* + * Best effort cleanup of ENTER break codes on leaving KDB. Called on + * exiting KDB, when we know we processed an ENTER or KP ENTER scan + * code. + */ +void kdb_kbd_cleanup_state(void) +{ + int scancode, scanstatus; + + /* + * Nothing to clean up, since either + * ENTER was never pressed, or has already + * gotten cleaned up. + */ + if (!kbd_last_ret) + return; + + kbd_last_ret = 0; + /* + * Enter key. Need to absorb the break code here, lest it gets + * leaked out if we exit KDB as the result of processing 'g'. + * + * This has several interesting implications: + * + Need to handle KP ENTER, which has break code 0xe0 0x9c. + * + Need to handle repeat ENTER and repeat KP ENTER. Repeats + * only get a break code at the end of the repeated + * sequence. This means we can't propagate the repeated key + * press, and must swallow it away. + * + Need to handle possible PS/2 mouse input. + * + Need to handle mashed keys. + */ + + while (1) { while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0) - ; + cpu_relax(); /* - * Fetch the scancode + * Fetch the scancode. */ scancode = inb(KBD_DATA_REG); scanstatus = inb(KBD_STATUS_REG); - while (scanstatus & KBD_STAT_MOUSE_OBF) { - scancode = inb(KBD_DATA_REG); - scanstatus = inb(KBD_STATUS_REG); - } + /* + * Skip mouse input. + */ + if (scanstatus & KBD_STAT_MOUSE_OBF) + continue; - if (scancode != 0x9c) { - /* - * Wasn't an enter-release, why not? - */ - kdb_printf("kdb: expected enter got 0x%x status 0x%x\n", - scancode, scanstatus); - } + /* + * If we see 0xe0, this is either a break code for KP + * ENTER, or a repeat make for KP ENTER. Either way, + * since the second byte is equivalent to an ENTER, + * skip the 0xe0 and try again. + * + * If we see 0x1c, this must be a repeat ENTER or KP + * ENTER (and we swallowed 0xe0 before). Try again. + * + * We can also see make and break codes for other keys + * mashed before or after pressing ENTER. Thus, if we + * see anything other than 0x9c, we have to try again. + * + * Note, if you held some key as ENTER was depressed, + * that break code would get leaked out. + */ + if (scancode != 0x9c) + continue; - return 13; + return; } - - return keychar & 0xff; } -EXPORT_SYMBOL_GPL(kdb_get_kbd_char); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index e2ae7349437f..67b847dfa2bb 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, if (KDB_STATE(DOING_SS)) KDB_STATE_CLEAR(SSBPT); + /* Clean up any keyboard devices before leaving */ + kdb_kbd_cleanup_state(); + return result; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index e381d105b40b..47c4e56e513b 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -246,6 +246,13 @@ extern void debug_kusage(void); extern void kdb_set_current_task(struct task_struct *); extern struct task_struct *kdb_current_task; + +#ifdef CONFIG_KDB_KEYBOARD +extern void kdb_kbd_cleanup_state(void); +#else /* ! CONFIG_KDB_KEYBOARD */ +#define kdb_kbd_cleanup_state() +#endif /* ! CONFIG_KDB_KEYBOARD */ + #ifdef CONFIG_MODULES extern struct list_head *kdb_modules; #endif /* CONFIG_MODULES */ -- cgit v1.2.3 From bec4d62ead8096e433d624d9339893f50badd992 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 19 Mar 2012 19:35:55 -0500 Subject: kgdb,debug_core: add the ability to control the reboot notifier Sometimes it is desirable to stop the kernel debugger before allowing a system to reboot either with kdb or kgdb. This patch adds the ability to turn the reboot notifier on and off or enter the debugger and stop kernel execution before rebooting. It is possible to change the setting after booting the kernel with the following: echo 1 > /sys/module/debug_core/parameters/kgdbreboot It is also possible to change this setting using kdb / kgdb to manipulate the variable directly. Using KDB: mm kgdbreboot 1 Using gdb: set kgdbreboot=1 Reported-by: Jan Kiszka Signed-off-by: Jason Wessel --- Documentation/DocBook/kgdb.tmpl | 17 +++++++++++++++++ kernel/debug/debug_core.c | 16 ++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/Documentation/DocBook/kgdb.tmpl b/Documentation/DocBook/kgdb.tmpl index d71b57fcf116..4ee4ba3509fc 100644 --- a/Documentation/DocBook/kgdb.tmpl +++ b/Documentation/DocBook/kgdb.tmpl @@ -361,6 +361,23 @@ It is possible to use this option with kgdboc on a tty that is not a system console. + + + Run time parameter: kgdbreboot + The kgdbreboot feature allows you to change how the debugger + deals with the reboot notification. You have 3 choices for the + behavior. The default behavior is always set to 0. + + echo -1 > /sys/module/debug_core/parameters/kgdbreboot + Ignore the reboot notification entirely. + + echo 0 > /sys/module/debug_core/parameters/kgdbreboot + Send the detach message to any attached debugger client. + + echo 1 > /sys/module/debug_core/parameters/kgdbreboot + Enter the debugger on reboot notify. + + diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 3c1ad4e03543..3f88a45e6f0a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -76,6 +76,8 @@ static int exception_level; struct kgdb_io *dbg_io_ops; static DEFINE_SPINLOCK(kgdb_registration_lock); +/* Action for the reboot notifiter, a global allow kdb to change it */ +static int kgdbreboot; /* kgdb console driver is loaded */ static int kgdb_con_registered; /* determine if kgdb console output should be used */ @@ -97,6 +99,7 @@ static int __init opt_kgdb_con(char *str) early_param("kgdbcon", opt_kgdb_con); module_param(kgdb_use_con, int, 0644); +module_param(kgdbreboot, int, 0644); /* * Holds information about breakpoints in a kernel. These breakpoints are @@ -788,8 +791,21 @@ void __init dbg_late_init(void) static int dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x) { + /* + * Take the following action on reboot notify depending on value: + * 1 == Enter debugger + * 0 == [the default] detatch debug client + * -1 == Do nothing... and use this until the board resets + */ + switch (kgdbreboot) { + case 1: + kgdb_breakpoint(); + case -1: + goto done; + } if (!dbg_kdb_mode) gdbstub_exit(code); +done: return NOTIFY_DONE; } -- cgit v1.2.3 From b8adde8ddec9ff62a21564fa8020b5463e70d4de Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Wed, 21 Sep 2011 13:19:12 -0700 Subject: kdb: Avoid using dbg_io_ops until it is initialized This fixes a bug with setting a breakpoint during kdb initialization (from kdb_cmds). Any call to kdb_printf() before the initialization of the kgdboc serial console driver (which happens much later during bootup than kdb_init), results in kernel panic due to the use of dbg_io_ops before it is initialized. Signed-off-by: Tim Bird Signed-off-by: Jason Wessel --- kernel/debug/kdb/kdb_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 4802eb5840e1..9b5f17da1c56 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -689,7 +689,7 @@ kdb_printit: if (!dbg_kdb_mode && kgdb_connected) { gdbstub_msg_write(kdb_buffer, retlen); } else { - if (!dbg_io_ops->is_console) { + if (dbg_io_ops && !dbg_io_ops->is_console) { len = strlen(kdb_buffer); cp = kdb_buffer; while (len--) { -- cgit v1.2.3 From 1ba0c1720eb0de2d0f3abf84c0b128d10af520d1 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 21 Sep 2011 13:07:47 -0700 Subject: kdb: Add message about CONFIG_DEBUG_RODATA on failure to install breakpoint On x86, if CONFIG_DEBUG_RODATA is set, one cannot set breakpoints via KDB. Apparently this is a well-known problem, as at least one distribution now ships with both KDB enabled and CONFIG_DEBUG_RODATA=y for security reasons. This patch adds an printk message to the breakpoint failure case, in order to provide suggestions about how to use the debugger. Reported-by: Tim Bird Signed-off-by: Jason Wessel Acked-by: Tim Bird --- kernel/debug/kdb/kdb_bp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index 20059ef4459a..8418c2f8ec5d 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); +#ifdef CONFIG_DEBUG_RODATA + if (!bp->bp_type) { + kdb_printf("Software breakpoints are unavailable.\n" + " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " OR use hw breaks: help bph\n"); + } +#endif return 1; } return 0; -- cgit v1.2.3 From 6b43ae8a619d17c4935c3320d2ef9e92bdeed05d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 15 Mar 2012 13:04:03 -0700 Subject: ntp: Fix leap-second hrtimer livelock Since commit 7dffa3c673fbcf835cd7be80bb4aec8ad3f51168 the ntp subsystem has used an hrtimer for triggering the leapsecond adjustment. However, this can cause a potential livelock. Thomas diagnosed this as the following pattern: CPU 0 CPU 1 do_adjtimex() spin_lock_irq(&ntp_lock); process_adjtimex_modes(); timer_interrupt() process_adj_status(); do_timer() ntp_start_leap_timer(); write_lock(&xtime_lock); hrtimer_start(); update_wall_time(); hrtimer_reprogram(); ntp_tick_length() tick_program_event() spin_lock(&ntp_lock); clockevents_program_event() ktime_get() seq = req_seqbegin(xtime_lock); This patch tries to avoid the problem by reverting back to not using an hrtimer to inject leapseconds, and instead we handle the leapsecond processing in the second_overflow() function. The downside to this change is that on systems that support highres timers, the leap second processing will occur on a HZ tick boundary, (ie: ~1-10ms, depending on HZ) after the leap second instead of possibly sooner (~34us in my tests w/ x86_64 lapic). This patch applies on top of tip/timers/core. CC: Sasha Levin CC: Thomas Gleixner Reported-by: Sasha Levin Diagnoised-by: Thomas Gleixner Tested-by: Sasha Levin Signed-off-by: John Stultz --- include/linux/timex.h | 2 +- kernel/time/ntp.c | 128 +++++++++++++++------------------------------- kernel/time/timekeeping.c | 20 +++----- 3 files changed, 48 insertions(+), 102 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index b75e1864ed19..99bc88b1fc02 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -252,7 +252,7 @@ extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); -extern void second_overflow(void); +extern int second_overflow(unsigned long secs); extern int do_adjtimex(struct timex *); extern void hardpps(const struct timespec *, const struct timespec *); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 6e039b144daf..3d17ebd47fa2 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -34,8 +34,6 @@ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; -static struct hrtimer leap_timer; - #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -381,70 +379,63 @@ u64 ntp_tick_length(void) /* - * Leap second processing. If in leap-insert state at the end of the - * day, the system clock is set back one second; if in leap-delete - * state, the system clock is set ahead one second. + * this routine handles the overflow of the microsecond field + * + * The tricky bits of code to handle the accurate clock support + * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. + * They were originally developed for SUN and DEC kernels. + * All the kudos should go to Dave for this stuff. + * + * Also handles leap second processing, and returns leap offset */ -static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer) +int second_overflow(unsigned long secs) { - enum hrtimer_restart res = HRTIMER_NORESTART; - unsigned long flags; + s64 delta; int leap = 0; + unsigned long flags; spin_lock_irqsave(&ntp_lock, flags); + + /* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. + */ switch (time_state) { case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; break; case TIME_INS: - leap = -1; - time_state = TIME_OOP; - printk(KERN_NOTICE - "Clock: inserting leap second 23:59:60 UTC\n"); - hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC); - res = HRTIMER_RESTART; + if (secs % 86400 == 0) { + leap = -1; + time_state = TIME_OOP; + printk(KERN_NOTICE + "Clock: inserting leap second 23:59:60 UTC\n"); + } break; case TIME_DEL: - leap = 1; - time_tai--; - time_state = TIME_WAIT; - printk(KERN_NOTICE - "Clock: deleting leap second 23:59:59 UTC\n"); + if ((secs + 1) % 86400 == 0) { + leap = 1; + time_tai--; + time_state = TIME_WAIT; + printk(KERN_NOTICE + "Clock: deleting leap second 23:59:59 UTC\n"); + } break; case TIME_OOP: time_tai++; time_state = TIME_WAIT; - /* fall through */ + break; + case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; break; } - spin_unlock_irqrestore(&ntp_lock, flags); - - /* - * We have to call this outside of the ntp_lock to keep - * the proper locking hierarchy - */ - if (leap) - timekeeping_leap_insert(leap); - - return res; -} - -/* - * this routine handles the overflow of the microsecond field - * - * The tricky bits of code to handle the accurate clock support - * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. - * They were originally developed for SUN and DEC kernels. - * All the kudos should go to Dave for this stuff. - */ -void second_overflow(void) -{ - s64 delta; - unsigned long flags; - spin_lock_irqsave(&ntp_lock, flags); /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; @@ -481,8 +472,13 @@ void second_overflow(void) tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; + + + out: spin_unlock_irqrestore(&ntp_lock, flags); + + return leap; } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -544,27 +540,6 @@ static void notify_cmos_timer(void) static inline void notify_cmos_timer(void) { } #endif -/* - * Start the leap seconds timer: - */ -static inline void ntp_start_leap_timer(struct timespec *ts) -{ - long now = ts->tv_sec; - - if (time_status & STA_INS) { - time_state = TIME_INS; - now += 86400 - now % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - - return; - } - - if (time_status & STA_DEL) { - time_state = TIME_DEL; - now += 86400 - (now + 1) % 86400; - hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS); - } -} /* * Propagate a new txc->status value into the NTP state: @@ -589,22 +564,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; - switch (time_state) { - case TIME_OK: - ntp_start_leap_timer(ts); - break; - case TIME_INS: - case TIME_DEL: - time_state = TIME_OK; - ntp_start_leap_timer(ts); - case TIME_WAIT: - if (!(time_status & (STA_INS | STA_DEL))) - time_state = TIME_OK; - break; - case TIME_OOP: - hrtimer_restart(&leap_timer); - break; - } } /* * Called with the xtime lock held, so we can access and modify @@ -686,9 +645,6 @@ int do_adjtimex(struct timex *txc) (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; - - if (txc->modes & ADJ_STATUS && time_state != TIME_OK) - hrtimer_cancel(&leap_timer); } if (txc->modes & ADJ_SETOFFSET) { @@ -1010,6 +966,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); - hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); - leap_timer.function = ntp_leap_second; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b53da5ecbea2..5d76e09ddd3d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -184,18 +184,6 @@ static void timekeeping_update(bool clearntp) } -void timekeeping_leap_insert(int leapsecond) -{ - unsigned long flags; - - write_seqlock_irqsave(&timekeeper.lock, flags); - timekeeper.xtime.tv_sec += leapsecond; - timekeeper.wall_to_monotonic.tv_sec -= leapsecond; - timekeeping_update(false); - write_sequnlock_irqrestore(&timekeeper.lock, flags); - -} - /** * timekeeping_forward_now - update clock to the current time * @@ -969,9 +957,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; while (timekeeper.xtime_nsec >= nsecps) { + int leap; timekeeper.xtime_nsec -= nsecps; timekeeper.xtime.tv_sec++; - second_overflow(); + leap = second_overflow(timekeeper.xtime.tv_sec); + timekeeper.xtime.tv_sec += leap; } /* Accumulate raw time */ @@ -1082,9 +1072,11 @@ static void update_wall_time(void) * xtime.tv_nsec isn't larger then NSEC_PER_SEC */ if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { + int leap; timekeeper.xtime.tv_nsec -= NSEC_PER_SEC; timekeeper.xtime.tv_sec++; - second_overflow(); + leap = second_overflow(timekeeper.xtime.tv_sec); + timekeeper.xtime.tv_sec += leap; } timekeeping_update(false); -- cgit v1.2.3 From c7206205d00ab375839bd6c7ddb247d600693c09 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Mar 2012 17:26:36 +0100 Subject: perf: Fix mmap_page capabilities and docs Complete the syscall-less self-profiling feature and address all complaints, namely: - capabilities, so we can detect what is actually available at runtime Add a capabilities field to perf_event_mmap_page to indicate what is actually available for use. - on x86: RDPMC weirdness due to being 40/48 bits and not sign-extending properly. - ABI documentation as to how all this stuff works. Also improve the documentation for the new features. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Vince Weaver Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Link: http://lkml.kernel.org/r/1332433596.2487.33.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 ++++- include/linux/perf_event.h | 83 +++++++++++++++++++++++++++++++++++----- kernel/events/core.c | 4 +- 3 files changed, 84 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 453ac9497574..4ef8104958ee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1622,6 +1622,9 @@ static int x86_pmu_event_idx(struct perf_event *event) { int idx = event->hw.idx; + if (!x86_pmu.attr_rdpmc) + return 0; + if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) { idx -= X86_PMC_IDX_FIXED; idx |= 1 << 30; @@ -1706,14 +1709,19 @@ static struct pmu pmu = { .flush_branch_stack = x86_pmu_flush_branch_stack, }; -void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { + userpg->cap_usr_time = 0; + userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->pmc_width = x86_pmu.cntval_bits; + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; + userpg->cap_usr_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 57ae485e80fc..ca9ed4e6a286 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -299,18 +299,31 @@ struct perf_event_mmap_page { /* * Bits needed to read the hw events in user-space. * - * u32 seq; - * s64 count; + * u32 seq, time_mult, time_shift, idx, width; + * u64 count, enabled, running; + * u64 cyc, time_offset; + * s64 pmc = 0; * * do { * seq = pc->lock; - * * barrier() - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; + * + * enabled = pc->time_enabled; + * running = pc->time_running; + * + * if (pc->cap_usr_time && enabled != running) { + * cyc = rdtsc(); + * time_offset = pc->time_offset; + * time_mult = pc->time_mult; + * time_shift = pc->time_shift; + * } + * + * idx = pc->index; + * count = pc->offset; + * if (pc->cap_usr_rdpmc && idx) { + * width = pc->pmc_width; + * pmc = rdpmc(idx - 1); + * } * * barrier(); * } while (pc->lock != seq); @@ -323,14 +336,57 @@ struct perf_event_mmap_page { __s64 offset; /* add to hardware event value */ __u64 time_enabled; /* time event active */ __u64 time_running; /* time event on cpu */ - __u32 time_mult, time_shift; + union { + __u64 capabilities; + __u64 cap_usr_time : 1, + cap_usr_rdpmc : 1, + cap_____res : 62; + }; + + /* + * If cap_usr_rdpmc this field provides the bit-width of the value + * read using the rdpmc() or equivalent instruction. This can be used + * to sign extend the result like: + * + * pmc <<= 64 - width; + * pmc >>= 64 - width; // signed shift right + * count += pmc; + */ + __u16 pmc_width; + + /* + * If cap_usr_time the below fields can be used to compute the time + * delta since time_enabled (in ns) using rdtsc or similar. + * + * u64 quot, rem; + * u64 delta; + * + * quot = (cyc >> time_shift); + * rem = cyc & ((1 << time_shift) - 1); + * delta = time_offset + quot * time_mult + + * ((rem * time_mult) >> time_shift); + * + * Where time_offset,time_mult,time_shift and cyc are read in the + * seqcount loop described above. This delta can then be added to + * enabled and possible running (if idx), improving the scaling: + * + * enabled += delta; + * if (idx) + * running += delta; + * + * quot = count / running; + * rem = count % running; + * count = quot * enabled + (rem * enabled) / running; + */ + __u16 time_shift; + __u32 time_mult; __u64 time_offset; /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[121]; /* align to 1k */ + __u64 __reserved[120]; /* align to 1k */ /* * Control data for the mmap() data buffer. @@ -347,6 +403,13 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ }; +/* + * Build time assertion that we keep the data_head at the intended location. + * IOW, validation we got the __reserved[] size right. + */ +extern char __assert_mmap_data_head_offset + [1 - 2*!!(offsetof(struct perf_event_mmap_page, data_head) != 1024)]; + #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index c61234b1a988..dc3b05272511 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } -void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now) +void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event) userpg->time_running = running + atomic64_read(&event->child_total_time_running); - perf_update_user_clock(userpg, now); + arch_perf_update_userpage(userpg, now); barrier(); ++userpg->lock; -- cgit v1.2.3 From 6c16a6dcb05e51ace340ff7bc6dbe647f1593528 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Mar 2012 13:07:16 -0700 Subject: sched: Fix compiler warning about declared inline after use kernel/sched/fair.c:420: warning: 'account_cfs_rq_runtime' declared inline after being called kernel/sched/fair.c:420: warning: previous declaration of 'account_cfs_rq_runtime' was here kernel/sched/fair.c:1165: warning: 'return_cfs_rq_runtime' declared inlineafter being called kernel/sched/fair.c:1165: warning: previous declaration of 'return_cfs_rq_runtime' was here Reported-by: Andrew Morton Signed-off-by: Peter Zijlstra Signed-off-by: Andrew Morton Link: http://lkml.kernel.org/r/20120321200717.49BB4A024E@akpm.mtv.corp.google.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11f3979bad2a..258f430d71a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) #endif /* CONFIG_FAIR_GROUP_SCHED */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec); +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec); /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -1162,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) __clear_buddies_skip(se); } -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -1546,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, resched_task(rq_of(cfs_rq)->curr); } -static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) { if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled) return; @@ -2073,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq) } #else /* CONFIG_CFS_BANDWIDTH */ -static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, - unsigned long delta_exec) {} +static __always_inline +void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {} static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} -static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} +static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) { -- cgit v1.2.3 From e335e3eb82dada2765297f6ba501afc7555aba10 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Thu, 22 Mar 2012 15:25:08 +0530 Subject: locking/kconfig: Simplify INLINE_SPIN_UNLOCK usage Get rid of INLINE_SPIN_UNLOCK entirely replacing it with UNINLINE_SPIN_UNLOCK instead of the reverse meaning. Whoever wants to change the default spinlock inlining behavior and uninline the spinlocks for some weird reason, such as spinlock debugging, paravirt etc. can now all just select UNINLINE_SPIN_UNLOCK Original discussion at: https://lkml.org/lkml/2012/3/21/357 Suggested-by: Linus Torvalds Signed-off-by: Raghavendra K T Cc: Linus Torvalds Cc: Ralf Baechle Cc: Chris Metcalf Cc: Chris Zankel Cc: linux-mips@linux-mips.org Link: http://lkml.kernel.org/r/20120322095502.30866.75756.sendpatchset@codeblue [ tidied up the changelog a bit ] Signed-off-by: Ingo Molnar --- arch/mips/configs/db1300_defconfig | 2 +- arch/xtensa/configs/iss_defconfig | 2 +- include/linux/spinlock_api_smp.h | 2 +- kernel/Kconfig.locks | 4 ++-- kernel/Kconfig.preempt | 1 + kernel/spinlock.c | 2 +- lib/Kconfig.debug | 1 + 7 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/mips/configs/db1300_defconfig b/arch/mips/configs/db1300_defconfig index c38b190151c4..3590ab5d9791 100644 --- a/arch/mips/configs/db1300_defconfig +++ b/arch/mips/configs/db1300_defconfig @@ -133,7 +133,7 @@ CONFIG_BLK_DEV_BSG=y CONFIG_IOSCHED_NOOP=y CONFIG_DEFAULT_NOOP=y CONFIG_DEFAULT_IOSCHED="noop" -CONFIG_INLINE_SPIN_UNLOCK=y +# CONFIG_UNINLINE_SPIN_UNLOCK is not set CONFIG_INLINE_SPIN_UNLOCK_IRQ=y CONFIG_INLINE_READ_UNLOCK=y CONFIG_INLINE_READ_UNLOCK_IRQ=y diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index f932b30b47fb..ddab37b24741 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -113,7 +113,7 @@ CONFIG_DEFAULT_IOSCHED="noop" # CONFIG_INLINE_SPIN_LOCK_BH is not set # CONFIG_INLINE_SPIN_LOCK_IRQ is not set # CONFIG_INLINE_SPIN_LOCK_IRQSAVE is not set -CONFIG_INLINE_SPIN_UNLOCK=y +# CONFIG_UNINLINE_SPIN_UNLOCK is not set # CONFIG_INLINE_SPIN_UNLOCK_BH is not set CONFIG_INLINE_SPIN_UNLOCK_IRQ=y # CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE is not set diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index e253ccd7a604..51df117abe46 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -67,7 +67,7 @@ _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) #define _raw_spin_trylock_bh(lock) __raw_spin_trylock_bh(lock) #endif -#ifdef CONFIG_INLINE_SPIN_UNLOCK +#ifndef CONFIG_UNINLINE_SPIN_UNLOCK #define _raw_spin_unlock(lock) __raw_spin_unlock(lock) #endif diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 5068e2a4e75f..2251882daf53 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ ARCH_INLINE_SPIN_LOCK_IRQSAVE -config INLINE_SPIN_UNLOCK - def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) +config UNINLINE_SPIN_UNLOCK + bool config INLINE_SPIN_UNLOCK_BH def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 24e7cb0ba26a..3f9c97419f02 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" select PREEMPT_COUNT + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 84c7d96918bf..5cdd8065a3ce 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_lock_bh); #endif -#ifndef CONFIG_INLINE_SPIN_UNLOCK +#ifdef CONFIG_UNINLINE_SPIN_UNLOCK void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 05037dc9bde7..f32a41f2a82a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -499,6 +499,7 @@ config RT_MUTEX_TESTER config DEBUG_SPINLOCK bool "Spinlock and rw-lock debugging: basic checks" depends on DEBUG_KERNEL + select UNINLINE_SPIN_UNLOCK help Say Y here and build SMP to catch missing spinlock initialization and certain other kinds of spinlock errors commonly made. This is -- cgit v1.2.3 From ad30dfa94c5cc23931c822922a50bd163ab293a5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 23 Mar 2012 15:52:25 -0700 Subject: alarmtimer: Make sure we initialize the rtctimer jonghwan Choi reported seeing warnings with the alarmtimer code at suspend/resume time, and pointed out that the rtctimer isn't being properly initialized. This patch corrects this issue. Reported-by: jonghwan Choi Signed-off-by: John Stultz --- kernel/time/alarmtimer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a46f5d64504..c16548807f1e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -46,9 +46,10 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +static struct rtc_timer rtctimer; + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ -static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); @@ -783,6 +784,8 @@ static int __init alarmtimer_init(void) .nsleep = alarm_timer_nsleep, }; + rtc_timer_init(&rtctimer, NULL, NULL); + posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); -- cgit v1.2.3 From e919cfd42da54d400e7e0385f22cae3672dcf874 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 22 Mar 2012 19:14:46 -0700 Subject: time: Avoid scary backtraces when warning of > 11% adj Folks have been getting a number of warnings about time adjustments > 11%. The WARN_ON leaves a big useless backtrace so this patch removes it for a printk_once(). I'm still working to narrow down the cause of the > 11% adjustment. Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5d76e09ddd3d..16a175bed355 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -869,13 +869,15 @@ static void timekeeping_adjust(s64 offset) } else /* No adjustment needed */ return; - WARN_ONCE(timekeeper.clock->maxadj && - (timekeeper.mult + adj > timekeeper.clock->mult + - timekeeper.clock->maxadj), - "Adjusting %s more then 11%% (%ld vs %ld)\n", + if (unlikely(timekeeper.clock->maxadj && + (timekeeper.mult + adj > + timekeeper.clock->mult + timekeeper.clock->maxadj))) { + printk_once(KERN_WARNING + "Adjusting %s more than 11%% (%ld vs %ld)\n", timekeeper.clock->name, (long)timekeeper.mult + adj, (long)timekeeper.clock->mult + timekeeper.clock->maxadj); + } /* * So the following can be confusing. * -- cgit v1.2.3 From 335dd85895abeca1957d5eaa3013dfe8dc60c7d7 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Sat, 11 Feb 2012 17:54:59 -0200 Subject: time: remove no_sync_cmos_clock Commit 9863c90f682fba34cdc26c3437e8c00da6c83fa4 (x86, vmware: Remove deprecated VMI kernel support) removed the only place which set no_sync_cmos_clock. Since that commit, this variable is never set. Signed-off-by: Cesar Eduardo Barros Signed-off-by: John Stultz --- include/linux/time.h | 1 - kernel/time/ntp.c | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index b3061782dec3..97734e9409c6 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -116,7 +116,6 @@ static inline struct timespec timespec_sub(struct timespec lhs, extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); extern int update_persistent_clock(struct timespec now); -extern int no_sync_cmos_clock __read_mostly; void timekeeping_init(void); extern int timekeeping_suspended; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 3d17ebd47fa2..f03fd83b170b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -483,9 +483,6 @@ out: #ifdef CONFIG_GENERIC_CMOS_UPDATE -/* Disable the cmos update - used by virtualization and embedded */ -int no_sync_cmos_clock __read_mostly; - static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); @@ -532,8 +529,7 @@ static void sync_cmos_clock(struct work_struct *work) static void notify_cmos_timer(void) { - if (!no_sync_cmos_clock) - schedule_delayed_work(&sync_cmos_work, 0); + schedule_delayed_work(&sync_cmos_work, 0); } #else -- cgit v1.2.3 From 88b28adf6fcdd6d10a1cfc7765bb200d7366a265 Mon Sep 17 00:00:00 2001 From: Jim Cromie Date: Wed, 14 Mar 2012 21:28:56 -0600 Subject: kernel-time: fix s/then/than/ spelling errors Use than for comparisons, like more than. CC: John Stultz Signed-off-by: Jim Cromie Signed-off-by: John Stultz --- kernel/time/clocksource.c | 2 +- kernel/time/timekeeping.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index a45ca167ab24..c9583382141a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) { u64 ret; /* - * We won't try to correct for more then 11% adjustments (110,000 ppm), + * We won't try to correct for more than 11% adjustments (110,000 ppm), */ ret = (u64)cs->mult * 11; do_div(ret,100); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 16a175bed355..51b98568ba4d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -822,7 +822,7 @@ static void timekeeping_adjust(s64 offset) int adj; /* - * The point of this is to check if the error is greater then half + * The point of this is to check if the error is greater than half * an interval. * * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. @@ -830,7 +830,7 @@ static void timekeeping_adjust(s64 offset) * Note we subtract one in the shift, so that error is really error*2. * This "saves" dividing(shifting) interval twice, but keeps the * (error > interval) comparison as still measuring if error is - * larger then half an interval. + * larger than half an interval. * * Note: It does not "save" on aggravation when reading the code. */ @@ -838,7 +838,7 @@ static void timekeeping_adjust(s64 offset) if (error > interval) { /* * We now divide error by 4(via shift), which checks if - * the error is greater then twice the interval. + * the error is greater than twice the interval. * If it is greater, we need a bigadjust, if its smaller, * we can adjust by 1. */ @@ -949,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift) u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; u64 raw_nsecs; - /* If the offset is smaller then a shifted interval, do nothing */ + /* If the offset is smaller than a shifted interval, do nothing */ if (offset < timekeeper.cycle_interval<= timekeeper.cycle_interval) { @@ -1071,7 +1071,7 @@ static void update_wall_time(void) /* * Finally, make sure that after the rounding - * xtime.tv_nsec isn't larger then NSEC_PER_SEC + * xtime.tv_nsec isn't larger than NSEC_PER_SEC */ if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) { int leap; -- cgit v1.2.3 From ebec18a6d3aa1e7d84aab16225e87fd25170ec2b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision Userspace service managers/supervisors need to track their started services. Many services daemonize by double-forking and get implicitly re-parented to PID 1. The service manager will no longer be able to receive the SIGCHLD signals for them, and is no longer in charge of reaping the children with wait(). All information about the children is lost at the moment PID 1 cleans up the re-parented processes. With this prctl, a service manager process can mark itself as a sort of 'sub-init', able to stay as the parent for all orphaned processes created by the started services. All SIGCHLD signals will be delivered to the service manager. Receiving SIGCHLD and doing wait() is in cases of a service-manager much preferred over any possible asynchronous notification about specific PIDs, because the service manager has full access to the child process data in /proc and the PID can not be re-used until the wait(), the service-manager itself is in charge of, has happened. As a side effect, the relevant parent PID information does not get lost by a double-fork, which results in a more elaborate process tree and 'ps' output: before: # ps afx 253 ? Ss 0:00 /bin/dbus-daemon --system --nofork 294 ? Sl 0:00 /usr/libexec/polkit-1/polkitd 328 ? S 0:00 /usr/sbin/modem-manager 608 ? Sl 0:00 /usr/libexec/colord 658 ? Sl 0:00 /usr/libexec/upowerd 819 ? Sl 0:00 /usr/libexec/imsettings-daemon 916 ? Sl 0:00 /usr/libexec/udisks-daemon 917 ? S 0:00 \_ udisks-daemon: not polling any devices after: # ps afx 294 ? Ss 0:00 /bin/dbus-daemon --system --nofork 426 ? Sl 0:00 \_ /usr/libexec/polkit-1/polkitd 449 ? S 0:00 \_ /usr/sbin/modem-manager 635 ? Sl 0:00 \_ /usr/libexec/colord 705 ? Sl 0:00 \_ /usr/libexec/upowerd 959 ? Sl 0:00 \_ /usr/libexec/udisks-daemon 960 ? S 0:00 | \_ udisks-daemon: not polling any devices 977 ? Sl 0:00 \_ /usr/libexec/packagekitd This prctl is orthogonal to PID namespaces. PID namespaces are isolated from each other, while a service management process usually requires the services to live in the same namespace, to be able to talk to each other. Users of this will be the systemd per-user instance, which provides init-like functionality for the user's login session and D-Bus, which activates bus services on-demand. Both need init-like capabilities to be able to properly keep track of the services they start. Many thanks to Oleg for several rounds of review and insights. [akpm@linux-foundation.org: fix comment layout and spelling] [akpm@linux-foundation.org: add lengthy code comment from Oleg] Reviewed-by: Oleg Nesterov Signed-off-by: Lennart Poettering Signed-off-by: Kay Sievers Acked-by: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 3 +++ include/linux/sched.h | 12 ++++++++++++ kernel/exit.c | 33 ++++++++++++++++++++++++++++----- kernel/fork.c | 3 +++ kernel/sys.c | 8 ++++++++ 5 files changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index a0413ac3abe8..e0cfec2490aa 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -121,4 +121,7 @@ #define PR_SET_PTRACER 0x59616d61 # define PR_SET_PTRACER_ANY ((unsigned long)-1) +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 + #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0c147a4260a5..0c3854b0d4b1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -553,6 +553,18 @@ struct signal_struct { int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ + /* + * PR_SET_CHILD_SUBREAPER marks a process, like a service + * manager, to re-parent orphan (double-forking) child processes + * to this process instead of 'init'. The service manager is + * able to receive SIGCHLD signals and is able to investigate + * the process until it calls wait(). All children of this + * process will inherit a flag if they should look for a + * child_subreaper process at exit. + */ + unsigned int is_child_subreaper:1; + unsigned int has_child_subreaper:1; + /* POSIX.1b Interval Timers */ struct list_head posix_timers; diff --git a/kernel/exit.c b/kernel/exit.c index 16b07bfac224..456329fd4ea3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk) } /* - * When we die, we re-parent all our children. - * Try to give them to another thread in our thread - * group, and if no such member exists, give it to - * the child reaper process (ie "init") in our pid - * space. + * When we die, we re-parent all our children, and try to: + * 1. give them to another thread in our thread group, if such a member exists + * 2. give it to the first ancestor process which prctl'd itself as a + * child_subreaper for its children (like a service manager) + * 3. give it to the init process (PID 1) in our pid namespace */ static struct task_struct *find_new_reaper(struct task_struct *father) __releases(&tasklist_lock) @@ -722,6 +722,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father) * forget_original_parent() must move them somewhere. */ pid_ns->child_reaper = init_pid_ns.child_reaper; + } else if (father->signal->has_child_subreaper) { + struct task_struct *reaper; + + /* + * Find the first ancestor marked as child_subreaper. + * Note that the code below checks same_thread_group(reaper, + * pid_ns->child_reaper). This is what we need to DTRT in a + * PID namespace. However we still need the check above, see + * http://marc.info/?l=linux-kernel&m=131385460420380 + */ + for (reaper = father->real_parent; + reaper != &init_task; + reaper = reaper->real_parent) { + if (same_thread_group(reaper, pid_ns->child_reaper)) + break; + if (!reaper->signal->is_child_subreaper) + continue; + thread = reaper; + do { + if (!(thread->flags & PF_EXITING)) + return reaper; + } while_each_thread(reaper, thread); + } } return pid_ns->child_reaper; diff --git a/kernel/fork.c b/kernel/fork.c index 37674ec55cde..b9372a0bff18 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; + sig->has_child_subreaper = current->signal->has_child_subreaper || + current->signal->is_child_subreaper; + mutex_init(&sig->cred_guard_mutex); return 0; diff --git a/kernel/sys.c b/kernel/sys.c index 888d227fd195..9eb7fcab8df6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1962,6 +1962,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_MM: error = prctl_set_mm(arg2, arg3, arg4, arg5); break; + case PR_SET_CHILD_SUBREAPER: + me->signal->is_child_subreaper = !!arg2; + error = 0; + break; + case PR_GET_CHILD_SUBREAPER: + error = put_user(me->signal->is_child_subreaper, + (int __user *) arg2); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From 397a21f24d455982a8a6f9bc11b5f3326ce3c6ef Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:01:54 -0700 Subject: kernel/exit.c: if init dies, log a signal which killed it, if any I just received another user's pleas for help when their init mysteriously died. I again explained that they need to check whether it died because of bad instruction, a segv, or something else. Which was an annoying detour into writing a trivial C program to spawn his init and print its exit code: http://lists.busybox.net/pipermail/busybox/2012-January/077172.html I hear you saying "just test it under /bin/sh". Well, the crashing init _was_ /bin/sh. Which prompted me to make kernel do this first step automatically. We can print exit code, which makes it possible to see that death was from e.g. SIGILL without writing test programs. [akpm@linux-foundation.org: add 0x to hex number output] Signed-off-by: Denys Vlasenko Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 456329fd4ea3..3db1909faed9 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father) if (unlikely(pid_ns->child_reaper == father)) { write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) - panic("Attempted to kill init!"); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: + father->exit_code); + } zap_pid_ns_processes(pid_ns); write_lock_irq(&tasklist_lock); -- cgit v1.2.3 From 7a05c0f7bbae91d08b7d0acf016fdb42dbc912ae Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 23 Mar 2012 15:01:55 -0700 Subject: watchdog: make sure the watchdog thread gets CPU on loaded system If the system is loaded while hotplugging a CPU we might end up with a bogus hardlockup detection. This has been seen during LTP pounder test executed in parallel with hotplug test. The main problem is that enable_watchdog (called when CPU is brought up) registers perf event which periodically checks per-cpu counter (hrtimer_interrupts), updated from a hrtimer callback, but the hrtimer is fired from the kernel thread. This means that while we already do check for the hard lockup the kernel thread might be sitting on the runqueue with zillions of tasks so there is nobody to update the value we rely on and so we KABOOM. Let's fix this by boosting the watchdog thread priority before we wake it up rather than when it's already running. This still doesn't handle a case where we have the same amount of high prio FIFO tasks but that doesn't seem to be common. The current implementation doesn't handle that case anyway so this is not worse at least. Unfortunately, we cannot start perf counter from the watchdog thread because we could miss a real lock up and also we cannot start the hrtimer watchdog_enable because we there is no way (at least I don't know any) to start a hrtimer from a different CPU. [dzickus@redhat.com: fix compile issue with param] Cc: Ingo Molnar Cc: Peter Zijlstra Reviewed-by: Mandeep Singh Baines Signed-off-by: Michal Hocko Signed-off-by: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14bc092fb12c..203fc6e1a285 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -319,11 +319,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = 0 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); - sched_setscheduler(current, SCHED_FIFO, ¶m); - /* initialize timestamp */ __touch_watchdog(); @@ -350,7 +348,6 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); - param.sched_priority = 0; sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; } @@ -439,6 +436,7 @@ static int watchdog_enable(int cpu) /* create the watchdog thread */ if (!p) { + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); @@ -450,6 +448,7 @@ static int watchdog_enable(int cpu) } goto out; } + sched_setscheduler(p, SCHED_FIFO, ¶m); kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; per_cpu(softlockup_watchdog, cpu) = p; -- cgit v1.2.3 From 4501980aae221ed8120dee3491f799ecd75187ad Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Mar 2012 15:01:55 -0700 Subject: kernel/watchdog.c: convert to pr_foo() It fixes some 80-col wordwrappings and adds some consistency. Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 203fc6e1a285..a01cb03b045a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -9,6 +9,8 @@ * to those contributors as well. */ +#define pr_fmt(fmt) "NMI watchdog: " fmt + #include #include #include @@ -373,18 +375,20 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL); if (!IS_ERR(event)) { - printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + pr_info("enabled, takes one hw-pmu counter.\n"); goto out_save; } /* vary the KERN level based on the returned errno */ if (PTR_ERR(event) == -EOPNOTSUPP) - printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + pr_warning("disabled (cpu%i): hardware events not enabled\n", + cpu); else - printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + pr_err("disabled (cpu%i): unable to create perf event: %ld\n", + cpu, PTR_ERR(event)); return PTR_ERR(event); /* success path */ @@ -439,7 +443,7 @@ static int watchdog_enable(int cpu) struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { - printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + pr_err("softlockup watchdog for %i failed\n", cpu); if (!err) { /* if hardlockup hasn't already set this */ err = PTR_ERR(p); @@ -495,7 +499,7 @@ static void watchdog_enable_all_cpus(void) watchdog_enabled = 1; if (!watchdog_enabled) - printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + pr_err("failed to be enabled on some cpus\n"); } -- cgit v1.2.3 From b60f796c4ca72545327a069f12938360d833cce7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 23 Mar 2012 15:01:56 -0700 Subject: kernel/watchdog.c: add comment to watchdog() exit path Revelation from Peter. Cc: Peter Zijlstra Cc: Don Zickus Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index a01cb03b045a..df30ee08bdd4 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -349,6 +349,10 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } + /* + * Drop the policy/priority elevation during thread exit to avoid a + * scheduling latency spike. + */ __set_current_state(TASK_RUNNING); sched_setscheduler(current, SCHED_NORMAL, ¶m); return 0; -- cgit v1.2.3 From 8c5cf9e5c50dc902713897e10201aa71f3546aa1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:40 -0700 Subject: ptrace: don't modify flags on PTRACE_SETOPTIONS failure On ptrace(PTRACE_SETOPTIONS, pid, 0, ), we used to set those option bits which are known, and then fail with -EINVAL if there are some unknown bits in . This is inconsistent with typical error handling, which does not change any state if input is invalid. This patch changes PTRACE_SETOPTIONS behavior so that in this case, we return -EINVAL and don't change any bits in task->ptrace. It's very unlikely that there is userspace code in the wild which will be affected by this change: it should have the form ptrace(PTRACE_SETOPTIONS, pid, 0, PTRACE_O_BOGUSOPT) where PTRACE_O_BOGUSOPT is a constant unknown to the kernel. But kernel headers, naturally, don't contain any PTRACE_O_BOGUSOPTs, thus the only way userspace can use one if it defines one itself. I can't see why anyone would do such a thing deliberately. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 00ab2ca5ed11..273f56ea39d2 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -528,6 +528,9 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds static int ptrace_setoptions(struct task_struct *child, unsigned long data) { + if (data & ~(unsigned long)PTRACE_O_MASK) + return -EINVAL; + child->ptrace &= ~PT_TRACE_MASK; if (data & PTRACE_O_TRACESYSGOOD) @@ -551,7 +554,7 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data) if (data & PTRACE_O_TRACEEXIT) child->ptrace |= PT_TRACE_EXIT; - return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; + return 0; } static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) -- cgit v1.2.3 From 86b6c1f301faf085de5a3f9ce16b8de6e69c729b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:41 -0700 Subject: ptrace: simplify PTRACE_foo constants and PTRACE_SETOPTIONS code Exchange PT_TRACESYSGOOD and PT_PTRACE_CAP bit positions, which makes PT_option bits contiguous and therefore makes code in ptrace_setoptions() much simpler. Every PTRACE_O_TRACEevent is defined to (1 << PTRACE_EVENT_event) instead of using explicit numeric constants, to ensure we don't mess up relationship between bit positions and event ids. PT_EVENT_FLAG_SHIFT was not particularly useful, PT_OPT_FLAG_SHIFT with value of PT_EVENT_FLAG_SHIFT-1 is easier to use. PT_TRACE_MASK constant is nuked, the only its use is replaced by (PTRACE_O_MASK << PT_OPT_FLAG_SHIFT). Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Reviewed-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 33 +++++++++++++++------------------ kernel/ptrace.c | 31 ++++++++----------------------- 2 files changed, 23 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 6fdb196caa3e..6f1260ee5be5 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -54,17 +54,6 @@ /* flags in @data for PTRACE_SEIZE */ #define PTRACE_SEIZE_DEVEL 0x80000000 /* temp flag for development */ -/* options set using PTRACE_SETOPTIONS */ -#define PTRACE_O_TRACESYSGOOD 0x00000001 -#define PTRACE_O_TRACEFORK 0x00000002 -#define PTRACE_O_TRACEVFORK 0x00000004 -#define PTRACE_O_TRACECLONE 0x00000008 -#define PTRACE_O_TRACEEXEC 0x00000010 -#define PTRACE_O_TRACEVFORKDONE 0x00000020 -#define PTRACE_O_TRACEEXIT 0x00000040 - -#define PTRACE_O_MASK 0x0000007f - /* Wait extended result codes for the above trace options. */ #define PTRACE_EVENT_FORK 1 #define PTRACE_EVENT_VFORK 2 @@ -74,6 +63,17 @@ #define PTRACE_EVENT_EXIT 6 #define PTRACE_EVENT_STOP 7 +/* options set using PTRACE_SETOPTIONS */ +#define PTRACE_O_TRACESYSGOOD 1 +#define PTRACE_O_TRACEFORK (1 << PTRACE_EVENT_FORK) +#define PTRACE_O_TRACEVFORK (1 << PTRACE_EVENT_VFORK) +#define PTRACE_O_TRACECLONE (1 << PTRACE_EVENT_CLONE) +#define PTRACE_O_TRACEEXEC (1 << PTRACE_EVENT_EXEC) +#define PTRACE_O_TRACEVFORKDONE (1 << PTRACE_EVENT_VFORK_DONE) +#define PTRACE_O_TRACEEXIT (1 << PTRACE_EVENT_EXIT) + +#define PTRACE_O_MASK 0x0000007f + #include #ifdef __KERNEL__ @@ -88,13 +88,12 @@ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ -#define PT_TRACESYSGOOD 0x00000004 -#define PT_PTRACE_CAP 0x00000008 /* ptracer can follow suid-exec */ +#define PT_PTRACE_CAP 0x00000004 /* ptracer can follow suid-exec */ +#define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ -#define PT_EVENT_FLAG_SHIFT 4 -#define PT_EVENT_FLAG(event) (1 << (PT_EVENT_FLAG_SHIFT + (event) - 1)) - +#define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event))) +#define PT_TRACESYSGOOD PT_EVENT_FLAG(0) #define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) #define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) #define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) @@ -102,8 +101,6 @@ #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) -#define PT_TRACE_MASK 0x000003f4 - /* single stepping state bits (used on ARM and PA-RISC) */ #define PT_SINGLESTEP_BIT 31 #define PT_SINGLESTEP (1<ptrace &= ~PT_TRACE_MASK; - - if (data & PTRACE_O_TRACESYSGOOD) - child->ptrace |= PT_TRACESYSGOOD; - - if (data & PTRACE_O_TRACEFORK) - child->ptrace |= PT_TRACE_FORK; - - if (data & PTRACE_O_TRACEVFORK) - child->ptrace |= PT_TRACE_VFORK; - - if (data & PTRACE_O_TRACECLONE) - child->ptrace |= PT_TRACE_CLONE; - - if (data & PTRACE_O_TRACEEXEC) - child->ptrace |= PT_TRACE_EXEC; - - if (data & PTRACE_O_TRACEVFORKDONE) - child->ptrace |= PT_TRACE_VFORK_DONE; - - if (data & PTRACE_O_TRACEEXIT) - child->ptrace |= PT_TRACE_EXIT; + /* Avoid intermediate state when all opts are cleared */ + flags = child->ptrace; + flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); + flags |= (data << PT_OPT_FLAG_SHIFT); + child->ptrace = flags; return 0; } -- cgit v1.2.3 From aa9147c98f27550bd39416eca5a5844e54bced26 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:42 -0700 Subject: ptrace: make PTRACE_SEIZE set ptrace options specified in 'data' parameter This can be used to close a few corner cases in strace where we get unwanted racy behavior after attach, but before we have a chance to set options (the notorious post-execve SIGTRAP comes to mind), and removes the need to track "did we set opts for this task" state in strace internals. While we are at it: Make it possible to extend SEIZE in the future with more functionality by passing non-zero 'addr' parameter. To that end, error out if 'addr' is non-zero. PTRACE_ATTACH did not (and still does not) have such check, and users (strace) do pass garbage there... let's avoid repeating this mistake with SEIZE. Set all task->ptrace bits in one operation - before this change, we were adding PT_SEIZED and PT_PTRACE_CAP with task->ptrace |= BIT ops. This was probably ok (not a bug), but let's be on a safer side. Changes since v2: use (unsigned long) casts instead of (long) ones, move PTRACE_SEIZE_DEVEL-related code to separate lines of code. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Cc: Pedro Alves Reviewed-by: Oleg Nesterov Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 9acd07a6f5bb..4661c5bc07e5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -231,6 +231,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) } static int ptrace_attach(struct task_struct *task, long request, + unsigned long addr, unsigned long flags) { bool seize = (request == PTRACE_SEIZE); @@ -238,19 +239,29 @@ static int ptrace_attach(struct task_struct *task, long request, /* * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL is used to prevent applications + * gradually. SEIZE_DEVEL bit is used to prevent applications * expecting full SEIZE behaviors trapping on kernel commits which * are still in the process of implementing them. * * Only test programs for new ptrace behaviors being implemented * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. * - * Once SEIZE behaviors are completely implemented, this flag and - * the following test will be removed. + * Once SEIZE behaviors are completely implemented, this flag + * will be removed. */ retval = -EIO; - if (seize && !(flags & PTRACE_SEIZE_DEVEL)) - goto out; + if (seize) { + if (addr != 0) + goto out; + if (!(flags & PTRACE_SEIZE_DEVEL)) + goto out; + flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; + if (flags & ~(unsigned long)PTRACE_O_MASK) + goto out; + flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); + } else { + flags = PT_PTRACED; + } audit_ptrace(task); @@ -282,11 +293,11 @@ static int ptrace_attach(struct task_struct *task, long request, if (task->ptrace) goto unlock_tasklist; - task->ptrace = PT_PTRACED; if (seize) - task->ptrace |= PT_SEIZED; + flags |= PT_SEIZED; if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE)) - task->ptrace |= PT_PTRACE_CAP; + flags |= PT_PTRACE_CAP; + task->ptrace = flags; __ptrace_link(task, current); @@ -879,7 +890,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. @@ -1022,7 +1033,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, } if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) { - ret = ptrace_attach(child, request, data); + ret = ptrace_attach(child, request, addr, data); /* * Some architectures need to do book-keeping after * a ptrace attach. -- cgit v1.2.3 From ee00560c7dac1dbbf048446a8489550d0a5765b7 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 23 Mar 2012 15:02:43 -0700 Subject: ptrace: remove PTRACE_SEIZE_DEVEL bit PTRACE_SEIZE code is tested and ready for production use, remove the code which requires special bit in data argument to make PTRACE_SEIZE work. Strace team prepares for a new release of strace, and we would like to ship the code which uses PTRACE_SEIZE, preferably after this change goes into released kernel. Signed-off-by: Denys Vlasenko Acked-by: Tejun Heo Acked-by: Oleg Nesterov Cc: Pedro Alves Cc: Jan Kratochvil Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptrace.h | 5 +---- kernel/ptrace.c | 15 --------------- 2 files changed, 1 insertion(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 30be18064dfd..407c678d2e30 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -51,9 +51,6 @@ #define PTRACE_INTERRUPT 0x4207 #define PTRACE_LISTEN 0x4208 -/* flags in @data for PTRACE_SEIZE */ -#define PTRACE_SEIZE_DEVEL 0x80000000 /* temp flag for development */ - /* Wait extended result codes for the above trace options. */ #define PTRACE_EVENT_FORK 1 #define PTRACE_EVENT_VFORK 2 @@ -64,7 +61,7 @@ /* Extended result codes which enabled by means other than options. */ #define PTRACE_EVENT_STOP 128 -/* options set using PTRACE_SETOPTIONS */ +/* Options set using PTRACE_SETOPTIONS or using PTRACE_SEIZE @data param */ #define PTRACE_O_TRACESYSGOOD 1 #define PTRACE_O_TRACEFORK (1 << PTRACE_EVENT_FORK) #define PTRACE_O_TRACEVFORK (1 << PTRACE_EVENT_VFORK) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 4661c5bc07e5..ee8d49b9c309 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -237,25 +237,10 @@ static int ptrace_attach(struct task_struct *task, long request, bool seize = (request == PTRACE_SEIZE); int retval; - /* - * SEIZE will enable new ptrace behaviors which will be implemented - * gradually. SEIZE_DEVEL bit is used to prevent applications - * expecting full SEIZE behaviors trapping on kernel commits which - * are still in the process of implementing them. - * - * Only test programs for new ptrace behaviors being implemented - * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO. - * - * Once SEIZE behaviors are completely implemented, this flag - * will be removed. - */ retval = -EIO; if (seize) { if (addr != 0) goto out; - if (!(flags & PTRACE_SEIZE_DEVEL)) - goto out; - flags &= ~(unsigned long)PTRACE_SEIZE_DEVEL; if (flags & ~(unsigned long)PTRACE_O_MASK) goto out; flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT); -- cgit v1.2.3 From 629d362b9950166c6fac2aa8425db34d824bb043 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:44 -0700 Subject: signal: give SEND_SIG_FORCED more power to beat SIGNAL_UNKILLABLE force_sig_info() and friends have the special semantics for synchronous signals, this interface should not be used if the target is not current. And it needs the fixes, in particular the clearing of SIGNAL_UNKILLABLE is not exactly right. However there are callers which have to use force_ exactly because it clears SIGNAL_UNKILLABLE and thus it can kill the CLONE_NEWPID tasks, although this is almost always is wrong by various reasons. With this patch SEND_SIG_FORCED ignores SIGNAL_UNKILLABLE, like we do if the signal comes from the ancestor namespace. This makes the naming in prepare_signal() paths insane, fixed by the next cleanup. Note: this only affects SIGKILL/SIGSTOP, but this is enough for force_sig() abusers. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index e76001ccf5cd..2584f5a91fbe 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1059,7 +1059,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, assert_spin_locked(&t->sighand->siglock); result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, from_ancestor_ns)) + if (!prepare_signal(sig, t, + from_ancestor_ns || (info == SEND_SIG_FORCED))) goto ret; pending = group ? &t->signal->shared_pending : &t->pending; -- cgit v1.2.3 From def8cf72562e17ec8316ce0cb5697c7afd6400e3 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:45 -0700 Subject: signal: cosmetic, s/from_ancestor_ns/force/ in prepare_signal() paths Cosmetic, rename the from_ancestor_ns argument in prepare_signal() paths. After the previous change it doesn't match the reality. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 2584f5a91fbe..d523da02dd14 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -58,21 +58,20 @@ static int sig_handler_ignored(void __user *handler, int sig) (handler == SIG_DFL && sig_kernel_ignore(sig)); } -static int sig_task_ignored(struct task_struct *t, int sig, - int from_ancestor_ns) +static int sig_task_ignored(struct task_struct *t, int sig, bool force) { void __user *handler; handler = sig_handler(t, sig); if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && - handler == SIG_DFL && !from_ancestor_ns) + handler == SIG_DFL && !force) return 1; return sig_handler_ignored(handler, sig); } -static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +static int sig_ignored(struct task_struct *t, int sig, bool force) { /* * Blocked signals are never ignored, since the @@ -82,7 +81,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) return 0; - if (!sig_task_ignored(t, sig, from_ancestor_ns)) + if (!sig_task_ignored(t, sig, force)) return 0; /* @@ -855,7 +854,7 @@ static void ptrace_trap_notify(struct task_struct *t) * Returns true if the signal should be actually delivered, otherwise * it should be dropped. */ -static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +static int prepare_signal(int sig, struct task_struct *p, bool force) { struct signal_struct *signal = p->signal; struct task_struct *t; @@ -915,7 +914,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } } - return !sig_ignored(p, sig, from_ancestor_ns); + return !sig_ignored(p, sig, force); } /* @@ -1602,7 +1601,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) ret = 1; /* the signal is ignored */ result = TRACE_SIGNAL_IGNORED; - if (!prepare_signal(sig, t, 0)) + if (!prepare_signal(sig, t, false)) goto out; ret = 0; -- cgit v1.2.3 From a02d6fd643cbd4c559113b35b31d3b04e4ec60c7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:46 -0700 Subject: signal: zap_pid_ns_processes: s/SEND_SIG_NOINFO/SEND_SIG_FORCED/ Change zap_pid_ns_processes() to use SEND_SIG_FORCED, it looks more clear compared to SEND_SIG_NOINFO which relies on from_ancestor_ns logic send_signal(). It is also more efficient if we need to kill a lot of tasks because it doesn't alloc sigqueue. While at it, add the __fatal_signal_pending(task) check as a minor optimization. Signed-off-by: Oleg Nesterov Cc: Tejun Heo Cc: Anton Vorontsov Cc: "Eric W. Biederman" Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a8968396046d..17b232869a04 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -168,13 +168,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) while (nr > 0) { rcu_read_lock(); - /* - * Any nested-container's init processes won't ignore the - * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). - */ task = pid_task(find_vpid(nr), PIDTYPE_PID); - if (task) - send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); + if (task && !__fatal_signal_pending(task)) + send_sig_info(SIGKILL, SEND_SIG_FORCED, task); rcu_read_unlock(); -- cgit v1.2.3 From b3449922502f5a161ee2b5022a33aec8472fbf18 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: introduce umh_complete(sub_info) Preparation. Add the new trivial helper, umh_complete(). Currently it simply does complete(sub_info->complete). Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index a0a88543934e..8ea25944ce33 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -199,6 +199,11 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info) } EXPORT_SYMBOL(call_usermodehelper_freeinfo); +static void umh_complete(struct subprocess_info *sub_info) +{ + complete(sub_info->complete); +} + /* Keventd can't block, but this (a child) can. */ static int wait_for_helper(void *data) { @@ -235,7 +240,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - complete(sub_info->complete); + umh_complete(sub_info); return 0; } @@ -269,7 +274,7 @@ static void __call_usermodehelper(struct work_struct *work) case UMH_WAIT_EXEC: if (pid < 0) sub_info->retval = pid; - complete(sub_info->complete); + umh_complete(sub_info); } } -- cgit v1.2.3 From d0bd587a80960d7ba7e0c8396e154028c9045c54 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:47 -0700 Subject: usermodehelper: implement UMH_KILLABLE Implement UMH_KILLABLE, should be used along with UMH_WAIT_EXEC/PROC. The caller must ensure that subprocess_info->path/etc can not go away until call_usermodehelper_freeinfo(). call_usermodehelper_exec(UMH_KILLABLE) does wait_for_completion_killable. If it fails, it uses xchg(&sub_info->complete, NULL) to serialize with umh_complete() which does the same xhcg() to access sub_info->complete. If call_usermodehelper_exec wins, it can safely return. umh_complete() should get NULL and call call_usermodehelper_freeinfo(). Otherwise we know that umh_complete() was already called, in this case call_usermodehelper_exec() falls back to wait_for_completion() which should succeed "very soon". Note: UMH_NO_WAIT == -1 but it obviously should not be used with UMH_KILLABLE. We delay the neccessary cleanup to simplify the back porting. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 2 ++ kernel/kmod.c | 27 +++++++++++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 722f477c4ef7..1b5985855ffc 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -54,6 +54,8 @@ enum umh_wait { UMH_WAIT_PROC = 1, /* wait for the process to complete */ }; +#define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ + struct subprocess_info { struct work_struct work; struct completion *complete; diff --git a/kernel/kmod.c b/kernel/kmod.c index 8ea25944ce33..f92f917c450c 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -201,7 +201,15 @@ EXPORT_SYMBOL(call_usermodehelper_freeinfo); static void umh_complete(struct subprocess_info *sub_info) { - complete(sub_info->complete); + struct completion *comp = xchg(&sub_info->complete, NULL); + /* + * See call_usermodehelper_exec(). If xchg() returns NULL + * we own sub_info, the UMH_KILLABLE caller has gone away. + */ + if (comp) + complete(comp); + else + call_usermodehelper_freeinfo(sub_info); } /* Keventd can't block, but this (a child) can. */ @@ -252,6 +260,9 @@ static void __call_usermodehelper(struct work_struct *work) enum umh_wait wait = sub_info->wait; pid_t pid; + if (wait != UMH_NO_WAIT) + wait &= ~UMH_KILLABLE; + /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -461,9 +472,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, queue_work(khelper_wq, &sub_info->work); if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; + + if (wait & UMH_KILLABLE) { + retval = wait_for_completion_killable(&done); + if (!retval) + goto wait_done; + + /* umh_complete() will see NULL and free sub_info */ + if (xchg(&sub_info->complete, NULL)) + goto unlock; + /* fallthrough, umh_complete() was already called */ + } + wait_for_completion(&done); +wait_done: retval = sub_info->retval; - out: call_usermodehelper_freeinfo(sub_info); unlock: -- cgit v1.2.3 From 9d944ef32e83405a07376f112e9f02161d3e9731 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:48 -0700 Subject: usermodehelper: kill umh_wait, renumber UMH_* constants No functional changes. It is not sane to use UMH_KILLABLE with enum umh_wait, but obviously we do not want another argument in call_usermodehelper_* helpers. Kill this enum, use the plain int. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 18 +++++++----------- kernel/kmod.c | 8 ++------ security/keys/request_key.c | 2 +- 3 files changed, 10 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 1b5985855ffc..9efeae679106 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -48,12 +48,9 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; struct cred; struct file; -enum umh_wait { - UMH_NO_WAIT = -1, /* don't wait at all */ - UMH_WAIT_EXEC = 0, /* wait for the exec, but not the process */ - UMH_WAIT_PROC = 1, /* wait for the process to complete */ -}; - +#define UMH_NO_WAIT 0 /* don't wait at all */ +#define UMH_WAIT_EXEC 1 /* wait for the exec, but not the process */ +#define UMH_WAIT_PROC 2 /* wait for the process to complete */ #define UMH_KILLABLE 4 /* wait for EXEC/PROC killable */ struct subprocess_info { @@ -62,7 +59,7 @@ struct subprocess_info { char *path; char **argv; char **envp; - enum umh_wait wait; + int wait; int retval; int (*init)(struct subprocess_info *info, struct cred *new); void (*cleanup)(struct subprocess_info *info); @@ -80,15 +77,14 @@ void call_usermodehelper_setfns(struct subprocess_info *info, void *data); /* Actually execute the sub-process */ -int call_usermodehelper_exec(struct subprocess_info *info, enum umh_wait wait); +int call_usermodehelper_exec(struct subprocess_info *info, int wait); /* Free the subprocess_info. This is only needed if you're not going to call call_usermodehelper_exec */ void call_usermodehelper_freeinfo(struct subprocess_info *info); static inline int -call_usermodehelper_fns(char *path, char **argv, char **envp, - enum umh_wait wait, +call_usermodehelper_fns(char *path, char **argv, char **envp, int wait, int (*init)(struct subprocess_info *info, struct cred *new), void (*cleanup)(struct subprocess_info *), void *data) { @@ -106,7 +102,7 @@ call_usermodehelper_fns(char *path, char **argv, char **envp, } static inline int -call_usermodehelper(char *path, char **argv, char **envp, enum umh_wait wait) +call_usermodehelper(char *path, char **argv, char **envp, int wait) { return call_usermodehelper_fns(path, argv, envp, wait, NULL, NULL, NULL); diff --git a/kernel/kmod.c b/kernel/kmod.c index f92f917c450c..8341de91613f 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -257,12 +257,9 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - enum umh_wait wait = sub_info->wait; + int wait = sub_info->wait & ~UMH_KILLABLE; pid_t pid; - if (wait != UMH_NO_WAIT) - wait &= ~UMH_KILLABLE; - /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ @@ -451,8 +448,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns); * asynchronously if wait is not set, and runs as a child of keventd. * (ie. it runs with full root capabilities). */ -int call_usermodehelper_exec(struct subprocess_info *sub_info, - enum umh_wait wait) +int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { DECLARE_COMPLETION_ONSTACK(done); int retval = 0; diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 82465328c39b..cc3790315d2f 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -91,7 +91,7 @@ static void umh_keys_cleanup(struct subprocess_info *info) * Call a usermode helper with a specific session keyring. */ static int call_usermodehelper_keys(char *path, char **argv, char **envp, - struct key *session_keyring, enum umh_wait wait) + struct key *session_keyring, int wait) { gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL; struct subprocess_info *info = -- cgit v1.2.3 From 5b9bd473e3b8a8c6c4ae99be475e6e9b27568555 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: usermodehelper: ____call_usermodehelper() doesn't need do_exit() Minor cleanup. ____call_usermodehelper() can simply return, no need to call do_exit() explicitely. Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 8341de91613f..685b246b13b0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -188,7 +188,7 @@ static int ____call_usermodehelper(void *data) /* Exec failed? */ fail: sub_info->retval = retval; - do_exit(0); + return 0; } void call_usermodehelper_freeinfo(struct subprocess_info *info) -- cgit v1.2.3 From 3e63a93b987685f02421e18b2aa452d20553a88b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:49 -0700 Subject: kmod: introduce call_modprobe() helper No functional changes. Move the call_usermodehelper code from __request_module() into the new simple helper, call_modprobe(). Signed-off-by: Oleg Nesterov Cc: Tetsuo Handa Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 685b246b13b0..56a29e812ff0 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,21 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static int call_modprobe(char *module_name, int wait) +{ + static char *envp[] = { + "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + + char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + + return call_usermodehelper_fns(modprobe_path, argv, envp, + wait, NULL, NULL, NULL); +} + /** * __request_module - try to load a kernel module * @wait: wait (or not) for the operation to complete @@ -81,11 +96,6 @@ int __request_module(bool wait, const char *fmt, ...) char module_name[MODULE_NAME_LEN]; unsigned int max_modprobes; int ret; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL }; static atomic_t kmod_concurrent = ATOMIC_INIT(0); #define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ static int kmod_loop_msg; @@ -128,9 +138,7 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper_fns(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, - NULL, NULL, NULL); + ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); atomic_dec(&kmod_concurrent); return ret; -- cgit v1.2.3 From 1cc684ab75123efe7ff446eb821d44375ba8fa30 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 23 Mar 2012 15:02:50 -0700 Subject: kmod: make __request_module() killable As Tetsuo Handa pointed out, request_module() can stress the system while the oom-killed caller sleeps in TASK_UNINTERRUPTIBLE. The task T uses "almost all" memory, then it does something which triggers request_module(). Say, it can simply call sys_socket(). This in turn needs more memory and leads to OOM. oom-killer correctly chooses T and kills it, but this can't help because it sleeps in TASK_UNINTERRUPTIBLE and after that oom-killer becomes "disabled" by the TIF_MEMDIE task T. Make __request_module() killable. The only necessary change is that call_modprobe() should kmalloc argv and module_name, they can't live in the stack if we use UMH_KILLABLE. This memory is freed via call_usermodehelper_freeinfo()->cleanup. Reported-by: Tetsuo Handa Signed-off-by: Oleg Nesterov Cc: Rusty Russell Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index 56a29e812ff0..957a7aab8ebc 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -60,6 +60,12 @@ static DECLARE_RWSEM(umhelper_sem); */ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; +static void free_modprobe_argv(struct subprocess_info *info) +{ + kfree(info->argv[3]); /* check call_modprobe() */ + kfree(info->argv); +} + static int call_modprobe(char *module_name, int wait) { static char *envp[] = { @@ -69,10 +75,26 @@ static int call_modprobe(char *module_name, int wait) NULL }; - char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL); + if (!argv) + goto out; + + module_name = kstrdup(module_name, GFP_KERNEL); + if (!module_name) + goto free_argv; + + argv[0] = modprobe_path; + argv[1] = "-q"; + argv[2] = "--"; + argv[3] = module_name; /* check free_modprobe_argv() */ + argv[4] = NULL; return call_usermodehelper_fns(modprobe_path, argv, envp, - wait, NULL, NULL, NULL); + wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL); +free_argv: + kfree(argv); +out: + return -ENOMEM; } /** -- cgit v1.2.3 From b01c3a0010aabadf745f3e7fdb9cab682e0a28a2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 23 Mar 2012 15:41:20 +0100 Subject: perf: Move mmap page data_head offset assertion out of header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Having the build time assertion in header is making the perf build fail on x86 with: ../../include/linux/perf_event.h:411:32: error: variably modified \ ‘__assert_mmap_data_head_offset’ at file scope [-Werror] I'm moving the build time validation out of the header, because I think it's better than to lessen the perf build warn/error check. Signed-off-by: Jiri Olsa Cc: acme@redhat.com Cc: a.p.zijlstra@chello.nl Cc: paulus@samba.org Cc: cjashfor@linux.vnet.ibm.com Cc: fweisbec@gmail.com Link: http://lkml.kernel.org/r/1332513680-7870-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 7 ------- kernel/events/core.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ca9ed4e6a286..ddbb6a901f65 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -403,13 +403,6 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ }; -/* - * Build time assertion that we keep the data_head at the intended location. - * IOW, validation we got the __reserved[] size right. - */ -extern char __assert_mmap_data_head_offset - [1 - 2*!!(offsetof(struct perf_event_mmap_page, data_head) != 1024)]; - #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index dc3b05272511..3f92a19aa11e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7116,6 +7116,13 @@ void __init perf_event_init(void) /* do not patch jump label more than once per second */ jump_label_rate_limit(&perf_sched_events, HZ); + + /* + * Build time assertion that we keep the data_head at the intended + * location. IOW, validation we got the __reserved[] size right. + */ + BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head)) + != 1024); } static int __init perf_event_sysfs_init(void) -- cgit v1.2.3 From c5e14e763046b11dd8bf57b5dc9f3ab444af8e60 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 24 Mar 2012 12:46:23 +0100 Subject: alarmtimer: Don't call rtc_timer_init() when CONFIG_RTC_CLASS=n rtc_timer_init() is not available when CONFIG_RTC_CLASS=n. Provide a proper wrapper in the RTC section of alarmtimer.c Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: John Stultz --- kernel/time/alarmtimer.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index c16548807f1e..8a538c55fc7b 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -46,10 +46,9 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); -static struct rtc_timer rtctimer; - #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; static struct rtc_device *rtcdev; static DEFINE_SPINLOCK(rtcdev_lock); @@ -97,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev, return 0; } +static inline void alarmtimer_rtc_timer_init(void) +{ + rtc_timer_init(&rtctimer, NULL, NULL); +} + static struct class_interface alarmtimer_rtc_interface = { .add_dev = &alarmtimer_rtc_add_device, }; @@ -118,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void) #define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } +static inline void alarmtimer_rtc_timer_init(void) { } #endif /** @@ -784,7 +789,7 @@ static int __init alarmtimer_init(void) .nsleep = alarm_timer_nsleep, }; - rtc_timer_init(&rtctimer, NULL, NULL); + alarmtimer_rtc_timer_init(); posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock); posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock); -- cgit v1.2.3 From 02608bef8f774c058779546926889a2f2717a499 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 1 Feb 2012 10:33:14 +0800 Subject: module: add kernel param to force disable module load Sometimes we need to test a kernel of same version with code or config option changes. We already have sysctl to disable module load, but add a kernel parameter will be more convenient. Since modules_disabled is int, so here use bint type in core_param. TODO: make sysctl accept bool and change modules_disabled to bool Signed-off-by: Dave Young Signed-off-by: Rusty Russell --- Documentation/kernel-parameters.txt | 2 ++ kernel/module.c | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 58eac231fe69..e2f8c297a8a4 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1869,6 +1869,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted. shutdown the other cpus. Instead use the REBOOT_VECTOR irq. + nomodule Disable module load + nopat [X86] Disable PAT (page attribute table extension of pagetables) support. diff --git a/kernel/module.c b/kernel/module.c index 2c932760fd33..7e31da9750c0 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ /* Block module loading/unloading? */ int modules_disabled = 0; +core_param(nomodule, modules_disabled, bint, 0); /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); -- cgit v1.2.3 From 8b8252813dee8e8cd453bb219731c36b268c69a7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 26 Mar 2012 12:50:51 +1030 Subject: module_param: remove support for bool parameters which are really int. module_param(bool) used to counter-intuitively take an int. In fddd5201 (mid-2009) we allowed bool or int/unsigned int using a messy trick. This eliminates that code (though leaves the flags field in the struct, for impending use). Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 19 ++++++------------- kernel/params.c | 23 ++--------------------- 2 files changed, 8 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index c47f4d60db0b..4daa895648d9 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -47,9 +47,6 @@ struct kernel_param_ops { void (*free)(void *arg); }; -/* Flag bits for kernel_param.flags */ -#define KPARAM_ISBOOL 2 - struct kernel_param { const char *name; const struct kernel_param_ops *ops; @@ -131,8 +128,7 @@ struct kparam_array * The ops can have NULL set or get functions. */ #define module_param_cb(name, ops, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, \ - name, ops, arg, __same_type((arg), bool *), perm) + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm) /* On alpha, ia64 and ppc64 relocations to global data cannot go into read-only sections (which is part of respective UNIX ABI on these @@ -146,7 +142,7 @@ struct kparam_array /* This is the fundamental function for registering boot/module parameters. */ -#define __module_param_call(prefix, name, ops, arg, isbool, perm) \ +#define __module_param_call(prefix, name, ops, arg, perm) \ /* Default value instead of permissions? */ \ static int __param_perm_check_##name __attribute__((unused)) = \ BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2)) \ @@ -155,8 +151,7 @@ struct kparam_array static struct kernel_param __moduleparam_const __param_##name \ __used \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ - = { __param_str_##name, ops, perm, isbool ? KPARAM_ISBOOL : 0, \ - { arg } } + = { __param_str_##name, ops, perm, 0, { arg } } /* Obsolete - use module_param_cb() */ #define module_param_call(name, set, get, arg, perm) \ @@ -164,7 +159,6 @@ struct kparam_array { (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ - __same_type(arg, bool *), \ (perm) + sizeof(__check_old_set_param(set))*0) /* We don't get oldget: it's often a new-style param_get_uint, etc. */ @@ -245,8 +239,7 @@ static inline void __kernel_param_unlock(void) */ #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ - __module_param_call("", name, ¶m_ops_##type, \ - &var, __same_type(var, bool), perm) + __module_param_call("", name, ¶m_ops_##type, &var, perm) #endif /* !MODULE */ /** @@ -264,7 +257,7 @@ static inline void __kernel_param_unlock(void) = { len, string }; \ __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_ops_string, \ - .str = &__param_string_##name, 0, perm); \ + .str = &__param_string_##name, perm); \ __MODULE_PARM_TYPE(name, "string") /** @@ -403,7 +396,7 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_array_ops, \ .arr = &__param_arr_##name, \ - __same_type(array[0], bool), perm); \ + perm); \ __MODULE_PARM_TYPE(name, "array of " #type) extern struct kernel_param_ops param_array_ops; diff --git a/kernel/params.c b/kernel/params.c index 47f5bf12434a..508828afb874 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -297,35 +297,18 @@ EXPORT_SYMBOL(param_ops_charp); /* Actually could be a bool or an int, for historical reasons. */ int param_set_bool(const char *val, const struct kernel_param *kp) { - bool v; - int ret; - /* No equals means "set"... */ if (!val) val = "1"; /* One of =[yYnN01] */ - ret = strtobool(val, &v); - if (ret) - return ret; - - if (kp->flags & KPARAM_ISBOOL) - *(bool *)kp->arg = v; - else - *(int *)kp->arg = v; - return 0; + return strtobool(val, kp->arg); } EXPORT_SYMBOL(param_set_bool); int param_get_bool(char *buffer, const struct kernel_param *kp) { - bool val; - if (kp->flags & KPARAM_ISBOOL) - val = *(bool *)kp->arg; - else - val = *(int *)kp->arg; - /* Y and N chosen as being relatively non-coder friendly */ - return sprintf(buffer, "%c", val ? 'Y' : 'N'); + return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N'); } EXPORT_SYMBOL(param_get_bool); @@ -343,7 +326,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp) struct kernel_param dummy; dummy.arg = &boolval; - dummy.flags = KPARAM_ISBOOL; ret = param_set_bool(val, &dummy); if (ret == 0) *(bool *)kp->arg = !boolval; @@ -372,7 +354,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp) /* Match bool exactly, by re-using it. */ boolkp = *kp; boolkp.arg = &v; - boolkp.flags |= KPARAM_ISBOOL; ret = param_set_bool(val, &boolkp); if (ret == 0) -- cgit v1.2.3 From 026cee0086fe1df4cf74691cf273062cc769617d Mon Sep 17 00:00:00 2001 From: Pawel Moll Date: Mon, 26 Mar 2012 12:50:51 +1030 Subject: params: _initcall-like kernel parameters This patch adds a set of macros that can be used to declare kernel parameters to be parsed _before_ initcalls at a chosen level are executed. We rename the now-unused "flags" field of struct kernel_param as the level. It's signed, for when we use this for early params as well, in future. Linker macro collating init calls had to be modified in order to add additional symbols between levels that are later used by the init code to split the calls into blocks. Signed-off-by: Pawel Moll Signed-off-by: Rusty Russell --- arch/powerpc/mm/hugetlbpage.c | 3 +- include/asm-generic/vmlinux.lds.h | 35 +++++++++------------ include/linux/moduleparam.h | 51 +++++++++++++++++++++++++----- init/main.c | 65 +++++++++++++++++++++++++++++++++++---- kernel/module.c | 3 +- kernel/params.c | 16 +++++++--- 6 files changed, 132 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 57c7465e656e..a3e628727697 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -310,7 +310,8 @@ void __init reserve_hugetlb_gpages(void) int i; strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup); + parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, + &do_gpage_early_setup); /* * Walk gpage list in reverse, allocating larger page sizes first. diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 798603e8ec38..8aeadf6b553a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -616,30 +616,23 @@ *(.init.setup) \ VMLINUX_SYMBOL(__setup_end) = .; -#define INITCALLS \ - *(.initcallearly.init) \ - VMLINUX_SYMBOL(__early_initcall_end) = .; \ - *(.initcall0.init) \ - *(.initcall0s.init) \ - *(.initcall1.init) \ - *(.initcall1s.init) \ - *(.initcall2.init) \ - *(.initcall2s.init) \ - *(.initcall3.init) \ - *(.initcall3s.init) \ - *(.initcall4.init) \ - *(.initcall4s.init) \ - *(.initcall5.init) \ - *(.initcall5s.init) \ - *(.initcallrootfs.init) \ - *(.initcall6.init) \ - *(.initcall6s.init) \ - *(.initcall7.init) \ - *(.initcall7s.init) +#define INIT_CALLS_LEVEL(level) \ + VMLINUX_SYMBOL(__initcall##level##_start) = .; \ + *(.initcall##level##.init) \ + *(.initcall##level##s.init) \ #define INIT_CALLS \ VMLINUX_SYMBOL(__initcall_start) = .; \ - INITCALLS \ + *(.initcallearly.init) \ + INIT_CALLS_LEVEL(0) \ + INIT_CALLS_LEVEL(1) \ + INIT_CALLS_LEVEL(2) \ + INIT_CALLS_LEVEL(3) \ + INIT_CALLS_LEVEL(4) \ + INIT_CALLS_LEVEL(5) \ + INIT_CALLS_LEVEL(rootfs) \ + INIT_CALLS_LEVEL(6) \ + INIT_CALLS_LEVEL(7) \ VMLINUX_SYMBOL(__initcall_end) = .; #define CON_INITCALL \ diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 4daa895648d9..ea36486378d8 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -51,7 +51,7 @@ struct kernel_param { const char *name; const struct kernel_param_ops *ops; u16 perm; - u16 flags; + s16 level; union { void *arg; const struct kparam_string *str; @@ -128,7 +128,40 @@ struct kparam_array * The ops can have NULL set or get functions. */ #define module_param_cb(name, ops, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm) + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, 0) + +/** + * _param_cb - general callback for a module/cmdline parameter + * to be evaluated before certain initcall level + * @name: a valid C identifier which is the parameter name. + * @ops: the set & get operations for this parameter. + * @perm: visibility in sysfs. + * + * The ops can have NULL set or get functions. + */ +#define __level_param_cb(name, ops, arg, perm, level) \ + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, level) + +#define core_param_cb(name, ops, arg, perm) \ + __level_param_cb(name, ops, arg, perm, 1) + +#define postcore_param_cb(name, ops, arg, perm) \ + __level_param_cb(name, ops, arg, perm, 2) + +#define arch_param_cb(name, ops, arg, perm) \ + __level_param_cb(name, ops, arg, perm, 3) + +#define subsys_param_cb(name, ops, arg, perm) \ + __level_param_cb(name, ops, arg, perm, 4) + +#define fs_param_cb(name, ops, arg, perm) \ + __level_param_cb(name, ops, arg, perm, 5) + +#define device_param_cb(name, ops, arg, perm) \ + __level_param_cb(name, ops, arg, perm, 6) + +#define late_param_cb(name, ops, arg, perm) \ + __level_param_cb(name, ops, arg, perm, 7) /* On alpha, ia64 and ppc64 relocations to global data cannot go into read-only sections (which is part of respective UNIX ABI on these @@ -142,7 +175,7 @@ struct kparam_array /* This is the fundamental function for registering boot/module parameters. */ -#define __module_param_call(prefix, name, ops, arg, perm) \ +#define __module_param_call(prefix, name, ops, arg, perm, level) \ /* Default value instead of permissions? */ \ static int __param_perm_check_##name __attribute__((unused)) = \ BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2)) \ @@ -151,7 +184,7 @@ struct kparam_array static struct kernel_param __moduleparam_const __param_##name \ __used \ __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \ - = { __param_str_##name, ops, perm, 0, { arg } } + = { __param_str_##name, ops, perm, level, { arg } } /* Obsolete - use module_param_cb() */ #define module_param_call(name, set, get, arg, perm) \ @@ -159,7 +192,7 @@ struct kparam_array { (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ - (perm) + sizeof(__check_old_set_param(set))*0) + (perm) + sizeof(__check_old_set_param(set))*0, 0) /* We don't get oldget: it's often a new-style param_get_uint, etc. */ static inline int @@ -239,7 +272,7 @@ static inline void __kernel_param_unlock(void) */ #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ - __module_param_call("", name, ¶m_ops_##type, &var, perm) + __module_param_call("", name, ¶m_ops_##type, &var, perm, 0) #endif /* !MODULE */ /** @@ -257,7 +290,7 @@ static inline void __kernel_param_unlock(void) = { len, string }; \ __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_ops_string, \ - .str = &__param_string_##name, perm); \ + .str = &__param_string_##name, perm, 0); \ __MODULE_PARM_TYPE(name, "string") /** @@ -285,6 +318,8 @@ extern int parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, + s16 level_min, + s16 level_max, int (*unknown)(char *param, char *val)); /* Called by module remove. */ @@ -396,7 +431,7 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_array_ops, \ .arr = &__param_arr_##name, \ - perm); \ + perm, 0); \ __MODULE_PARM_TYPE(name, "array of " #type) extern struct kernel_param_ops param_array_ops; diff --git a/init/main.c b/init/main.c index c24805c824b9..439715858ba0 100644 --- a/init/main.c +++ b/init/main.c @@ -400,7 +400,7 @@ static int __init do_early_param(char *param, char *val) void __init parse_early_options(char *cmdline) { - parse_args("early options", cmdline, NULL, 0, do_early_param); + parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param); } /* Arch code calls this early on, or if not, just before other parsing. */ @@ -503,7 +503,7 @@ asmlinkage void __init start_kernel(void) parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, - &unknown_bootoption); + 0, 0, &unknown_bootoption); jump_label_init(); @@ -699,16 +699,69 @@ int __init_or_module do_one_initcall(initcall_t fn) } -extern initcall_t __initcall_start[], __initcall_end[], __early_initcall_end[]; +extern initcall_t __initcall_start[]; +extern initcall_t __initcall0_start[]; +extern initcall_t __initcall1_start[]; +extern initcall_t __initcall2_start[]; +extern initcall_t __initcall3_start[]; +extern initcall_t __initcall4_start[]; +extern initcall_t __initcall5_start[]; +extern initcall_t __initcall6_start[]; +extern initcall_t __initcall7_start[]; +extern initcall_t __initcall_end[]; + +static initcall_t *initcall_levels[] __initdata = { + __initcall0_start, + __initcall1_start, + __initcall2_start, + __initcall3_start, + __initcall4_start, + __initcall5_start, + __initcall6_start, + __initcall7_start, + __initcall_end, +}; + +static char *initcall_level_names[] __initdata = { + "early parameters", + "core parameters", + "postcore parameters", + "arch parameters", + "subsys parameters", + "fs parameters", + "device parameters", + "late parameters", +}; + +static int __init ignore_unknown_bootoption(char *param, char *val) +{ + return 0; +} -static void __init do_initcalls(void) +static void __init do_initcall_level(int level) { + extern const struct kernel_param __start___param[], __stop___param[]; initcall_t *fn; - for (fn = __early_initcall_end; fn < __initcall_end; fn++) + strcpy(static_command_line, saved_command_line); + parse_args(initcall_level_names[level], + static_command_line, __start___param, + __stop___param - __start___param, + level, level, + ignore_unknown_bootoption); + + for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++) do_one_initcall(*fn); } +static void __init do_initcalls(void) +{ + int level; + + for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++) + do_initcall_level(level); +} + /* * Ok, the machine is now initialized. None of the devices * have been touched yet, but the CPU subsystem is up and @@ -732,7 +785,7 @@ static void __init do_pre_smp_initcalls(void) { initcall_t *fn; - for (fn = __initcall_start; fn < __early_initcall_end; fn++) + for (fn = __initcall_start; fn < __initcall0_start; fn++) do_one_initcall(*fn); } diff --git a/kernel/module.c b/kernel/module.c index 7e31da9750c0..6f6651a54590 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2923,7 +2923,8 @@ static struct module *load_module(void __user *umod, mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. */ - err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); + err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, + -32768, 32767, NULL); if (err < 0) goto unlink; diff --git a/kernel/params.c b/kernel/params.c index 508828afb874..f37d82631347 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -87,6 +87,8 @@ static int parse_one(char *param, char *val, const struct kernel_param *params, unsigned num_params, + s16 min_level, + s16 max_level, int (*handle_unknown)(char *param, char *val)) { unsigned int i; @@ -95,6 +97,9 @@ static int parse_one(char *param, /* Find parameter */ for (i = 0; i < num_params; i++) { if (parameq(param, params[i].name)) { + if (params[i].level < min_level + || params[i].level > max_level) + return 0; /* No one handled NULL, so do it here. */ if (!val && params[i].ops->set != param_set_bool && params[i].ops->set != param_set_bint) @@ -174,6 +179,8 @@ int parse_args(const char *name, char *args, const struct kernel_param *params, unsigned num, + s16 min_level, + s16 max_level, int (*unknown)(char *param, char *val)) { char *param, *val; @@ -189,7 +196,8 @@ int parse_args(const char *name, args = next_arg(args, ¶m, &val); irq_was_disabled = irqs_disabled(); - ret = parse_one(param, val, params, num, unknown); + ret = parse_one(param, val, params, num, + min_level, max_level, unknown); if (irq_was_disabled && !irqs_disabled()) { printk(KERN_WARNING "parse_args(): option '%s' enabled " "irq's!\n", param); @@ -374,7 +382,7 @@ static int param_array(const char *name, unsigned int min, unsigned int max, void *elem, int elemsize, int (*set)(const char *, const struct kernel_param *kp), - u16 flags, + s16 level, unsigned int *num) { int ret; @@ -384,7 +392,7 @@ static int param_array(const char *name, /* Get the name right for errors. */ kp.name = name; kp.arg = elem; - kp.flags = flags; + kp.level = level; *num = 0; /* We expect a comma-separated list of values. */ @@ -425,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp) unsigned int temp_num; return param_array(kp->name, val, 1, arr->max, arr->elem, - arr->elemsize, arr->ops->set, kp->flags, + arr->elemsize, arr->ops->set, kp->level, arr->num ?: &temp_num); } -- cgit v1.2.3 From d53799be6758841e1ffb1fd3780f73d0ffe44432 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 26 Mar 2012 12:50:52 +1030 Subject: module: move __module_get and try_module_get() out of line. With the preempt, tracepoint and everything, it's getting a bit chubby. For an Ubuntu-based config: Before: $ size -t `find * -name '*.ko'` | grep TOTAL 56199906 3870760 1606616 61677282 3ad1ee2 (TOTALS) $ size vmlinux text data bss dec hex filename 8509342 850368 3358720 12718430 c2115e vmlinux After: $ size -t `find * -name '*.ko'` | grep TOTAL 56183760 3867892 1606616 61658268 3acd49c (TOTALS) $ size vmlinux text data bss dec hex filename 8501842 849088 3358720 12709650 c1ef12 vmlinux Signed-off-by: Steven Rostedt Acked-by: Ingo Molnar Signed-off-by: Rusty Russell (made all out-of-line) --- include/linux/module.h | 32 ++++---------------------------- kernel/module.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 4598bf03e98b..fbcafe2ee13e 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -21,8 +21,6 @@ #include #include -#include - /* Not Yet Implemented */ #define MODULE_SUPPORTED_DEVICE(name) @@ -452,33 +450,11 @@ void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not to handle the error case (which only happens with rmmod --wait). */ -static inline void __module_get(struct module *module) -{ - if (module) { - preempt_disable(); - __this_cpu_inc(module->refptr->incs); - trace_module_get(module, _THIS_IP_); - preempt_enable(); - } -} - -static inline int try_module_get(struct module *module) -{ - int ret = 1; - - if (module) { - preempt_disable(); +extern void __module_get(struct module *module); - if (likely(module_is_live(module))) { - __this_cpu_inc(module->refptr->incs); - trace_module_get(module, _THIS_IP_); - } else - ret = 0; - - preempt_enable(); - } - return ret; -} +/* This is the Right Way to get a module: if it fails, it's being removed, + * so pretend it's not there. */ +extern bool try_module_get(struct module *module); extern void module_put(struct module *module); diff --git a/kernel/module.c b/kernel/module.c index 6f6651a54590..294692d8fcd8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -904,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr, static struct module_attribute modinfo_refcnt = __ATTR(refcnt, 0444, show_refcnt, NULL); +void __module_get(struct module *module) +{ + if (module) { + preempt_disable(); + __this_cpu_inc(module->refptr->incs); + trace_module_get(module, _RET_IP_); + preempt_enable(); + } +} +EXPORT_SYMBOL(__module_get); + +bool try_module_get(struct module *module) +{ + bool ret = true; + + if (module) { + preempt_disable(); + + if (likely(module_is_live(module))) { + __this_cpu_inc(module->refptr->incs); + trace_module_get(module, _RET_IP_); + } else + ret = false; + + preempt_enable(); + } + return ret; +} +EXPORT_SYMBOL(try_module_get); + void module_put(struct module *module) { if (module) { -- cgit v1.2.3 From f946eeb9313ff1470758e171a60fe7438a2ded3f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Mon, 30 Jan 2012 23:07:22 -0500 Subject: module: Remove module size limit Module size was limited to 64MB, this was legacy limitation due to vmalloc() which was removed a while ago. Limiting module size to 64MB is both pointless and affects real world use cases. Cc: Tim Abbott Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin Signed-off-by: Rusty Russell --- kernel/module.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 294692d8fcd8..78ac6ec1e425 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2411,8 +2411,7 @@ static int copy_and_check(struct load_info *info, return -ENOEXEC; /* Suck in entire file: we'll want most of it. */ - /* vmalloc barfs on "unusual" numbers. Check here */ - if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL) + if ((hdr = vmalloc(len)) == NULL) return -ENOMEM; if (copy_from_user(hdr, umod, len) != 0) { -- cgit v1.2.3 From 2baab4e90495ebc9826c93f79d74d6e60a828d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 20 Mar 2012 15:57:01 +0100 Subject: sched: Fix select_fallback_rq() vs cpu_active/cpu_online Commit 5fbd036b55 ("sched: Cleanup cpu_active madness"), which was supposed to finally sort the cpu_active mess, instead uncovered more. Since CPU_STARTING is ran before setting the cpu online, there's a (small) window where the cpu has active,!online. If during this time there's a wakeup of a task that used to reside on that cpu select_task_rq() will use select_fallback_rq() to compute an alternative cpu to run on since we find !online. select_fallback_rq() however will compute the new cpu against cpu_active, this means that it can return the same cpu it started out with, the !online one, since that cpu is in fact marked active. This results in us trying to scheduling a task on an offline cpu and triggering a WARN in the IPI code. The solution proposed by Chuansheng Liu of setting cpu_active in set_cpu_online() is buggy, firstly not all archs actually use set_cpu_online(), secondly, not all archs call set_cpu_online() with IRQs disabled, this means we would introduce either the same race or the race from fd8a7de17 ("x86: cpu-hotplug: Prevent softirq wakeup on wrong CPU") -- albeit much narrower. [ By setting online first and active later we have a window of online,!active, fresh and bound kthreads have task_cpu() of 0 and since cpu0 isn't in tsk_cpus_allowed() we end up in select_fallback_rq() which excludes !active, resulting in a reset of ->cpus_allowed and the thread running all over the place. ] The solution is to re-work select_fallback_rq() to require active _and_ online. This makes the active,!online case work as expected, OTOH archs running CPU_STARTING after setting online are now vulnerable to the issue from fd8a7de17 -- these are alpha and blackfin. Reported-by: Chuansheng Liu Signed-off-by: Peter Zijlstra Cc: Mike Frysinger Cc: linux-alpha@vger.kernel.org Link: http://lkml.kernel.org/n/tip-hubqk1i10o4dpvlm06gq7v6j@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 6 ++--- kernel/cpuset.c | 20 ++++------------ kernel/sched/core.c | 62 +++++++++++++++++++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e9eaec522655..e0ffaf061ab7 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -22,7 +22,7 @@ extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); -extern int cpuset_cpus_allowed_fallback(struct task_struct *p); +extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); @@ -144,10 +144,8 @@ static inline void cpuset_cpus_allowed(struct task_struct *p, cpumask_copy(mask, cpu_possible_mask); } -static inline int cpuset_cpus_allowed_fallback(struct task_struct *p) +static inline void cpuset_cpus_allowed_fallback(struct task_struct *p) { - do_set_cpus_allowed(p, cpu_possible_mask); - return cpumask_any(cpu_active_mask); } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index a09ac2b9a661..c9837b74ab96 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2195,7 +2195,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) mutex_unlock(&callback_mutex); } -int cpuset_cpus_allowed_fallback(struct task_struct *tsk) +void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpuset *cs; int cpu; @@ -2219,22 +2219,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk) * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary * set any mask even if it is not right from task_cs() pov, * the pending set_cpus_allowed_ptr() will fix things. + * + * select_fallback_rq() will fix things ups and set cpu_possible_mask + * if required. */ - - cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask); - if (cpu >= nr_cpu_ids) { - /* - * Either tsk->cpus_allowed is wrong (see above) or it - * is actually empty. The latter case is only possible - * if we are racing with remove_tasks_in_empty_cpuset(). - * Like above we can temporary set any mask and rely on - * set_cpus_allowed_ptr() as synchronization point. - */ - do_set_cpus_allowed(tsk, cpu_possible_mask); - cpu = cpumask_any(cpu_active_mask); - } - - return cpu; } void cpuset_init_current_mems_allowed(void) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e3ccc13c4caa..9c1629c90b2d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1263,29 +1263,59 @@ EXPORT_SYMBOL_GPL(kick_process); */ static int select_fallback_rq(int cpu, struct task_struct *p) { - int dest_cpu; const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu)); + enum { cpuset, possible, fail } state = cpuset; + int dest_cpu; /* Look for allowed, online CPU in same node. */ - for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) + for_each_cpu_mask(dest_cpu, *nodemask) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) return dest_cpu; + } - /* Any allowed, online CPU? */ - dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask); - if (dest_cpu < nr_cpu_ids) - return dest_cpu; + for (;;) { + /* Any allowed, online CPU? */ + for_each_cpu_mask(dest_cpu, *tsk_cpus_allowed(p)) { + if (!cpu_online(dest_cpu)) + continue; + if (!cpu_active(dest_cpu)) + continue; + goto out; + } - /* No more Mr. Nice Guy. */ - dest_cpu = cpuset_cpus_allowed_fallback(p); - /* - * Don't tell them about moving exiting tasks or - * kernel threads (both mm NULL), since they never - * leave kernel. - */ - if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", - task_pid_nr(p), p->comm, cpu); + switch (state) { + case cpuset: + /* No more Mr. Nice Guy. */ + cpuset_cpus_allowed_fallback(p); + state = possible; + break; + + case possible: + do_set_cpus_allowed(p, cpu_possible_mask); + state = fail; + break; + + case fail: + BUG(); + break; + } + } + +out: + if (state != cpuset) { + /* + * Don't tell them about moving exiting tasks or + * kernel threads (both mm NULL), since they never + * leave kernel. + */ + if (p->mm && printk_ratelimit()) { + printk_sched("process %d (%s) no longer affine to cpu%d\n", + task_pid_nr(p), p->comm, cpu); + } } return dest_cpu; -- cgit v1.2.3 From 1b028abc779b67b699daff55e27d2432f8d92666 Mon Sep 17 00:00:00 2001 From: Michael J Wang Date: Mon, 19 Mar 2012 22:26:19 +0000 Subject: sched/rt: Improve pick_next_highest_task_rt() Avoid extra work by continuing on to the next rt_rq if the highest prio task in current rt_rq is the same priority as our candidate task. More detailed explanation: if next is not NULL, then we have found a candidate task, and its priority is next->prio. Now we are looking for an even higher priority task in the other rt_rq's. idx is the highest priority in the current candidate rt_rq. In the current 3.3 code, if idx is equal to next->prio, we would start scanning the tasks in that rt_rq and replace the current candidate task with a task from that rt_rq. But the new task would only have a priority that is equal to our previous candidate task, so we have not advanced our goal of finding a higher prio task. So we should avoid the extra work by continuing on to the next rt_rq if idx is equal to next->prio. Signed-off-by: Michael J Wang Acked-by: Steven Rostedt Reviewed-by: Yong Zhang Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/2EF88150C0EF2C43A218742ED384C1BC0FC83D6B@IRVEXCHMB08.corp.ad.broadcom.com Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index b60dad720173..44af55e6d5d0 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1428,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) next_idx: if (idx >= MAX_RT_PRIO) continue; - if (next && next->prio < idx) + if (next && next->prio <= idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { struct task_struct *p; -- cgit v1.2.3 From 12b5da349a8b94c9dbc3430a6bc42eabd9eaf50b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 27 Mar 2012 10:43:28 -0400 Subject: tracing: Fix ent_size in trace output When reading the trace file, the records of each of the per_cpu buffers are examined to find the next event to print out. At the point of looking at the event, the size of the event is recorded. But if the first event is chosen, the other events in the other CPU buffers will reset the event size that is stored in the iterator descriptor, causing the event size passed to the output functions to be incorrect. In most cases this is not a problem, but for the case of stack traces, it is. With the change to the stack tracing to record a dynamic number of back traces, the output depends on the size of the entry instead of the fixed 8 back traces. When the entry size is not correct, the back traces would not be fully printed. Note, reading from the per-cpu trace files were not affected. Reported-by: Thomas Gleixner Tested-by: Thomas Gleixner Cc: stable@vger.kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a19c354edd6..ed7b5d1e12f4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1698,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, int cpu_file = iter->cpu_file; u64 next_ts = 0, ts; int next_cpu = -1; + int next_size = 0; int cpu; /* @@ -1729,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, next_cpu = cpu; next_ts = ts; next_lost = lost_events; + next_size = iter->ent_size; } } + iter->ent_size = next_size; + if (ent_cpu) *ent_cpu = next_cpu; -- cgit v1.2.3 From 160594e99dbbb0a5600ad922c630952c7c1c14bf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 28 Mar 2012 13:46:09 +0300 Subject: cpusets: Remove an unused variable We don't use "cpu" any more after 2baab4e904 "sched: Fix select_fallback_rq() vs cpu_active/cpu_online". Signed-off-by: Dan Carpenter Cc: Paul Menage Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20120328104608.GD29022@elgon.mountain Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c9837b74ab96..4ef4d7ecb9fb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2198,7 +2198,6 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { const struct cpuset *cs; - int cpu; rcu_read_lock(); cs = task_cs(tsk); -- cgit v1.2.3 From d550bbd40c0e10aefa05103dadbe0ae42e683707 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Disintegrate asm/system.h for Sparc Disintegrate asm/system.h for Sparc. Signed-off-by: David Howells cc: sparclinux@vger.kernel.org --- arch/sparc/include/asm/atomic_32.h | 2 +- arch/sparc/include/asm/atomic_64.h | 3 +- arch/sparc/include/asm/auxio_32.h | 1 - arch/sparc/include/asm/barrier.h | 8 + arch/sparc/include/asm/barrier_32.h | 15 ++ arch/sparc/include/asm/barrier_64.h | 56 ++++++ arch/sparc/include/asm/bug.h | 3 + arch/sparc/include/asm/cacheflush_32.h | 9 + arch/sparc/include/asm/cacheflush_64.h | 10 + arch/sparc/include/asm/cmpxchg.h | 8 + arch/sparc/include/asm/cmpxchg_32.h | 112 +++++++++++ arch/sparc/include/asm/cmpxchg_64.h | 145 ++++++++++++++ arch/sparc/include/asm/cpu_type.h | 34 ++++ arch/sparc/include/asm/exec.h | 6 + arch/sparc/include/asm/floppy_32.h | 1 - arch/sparc/include/asm/futex_64.h | 1 - arch/sparc/include/asm/io_32.h | 1 - arch/sparc/include/asm/io_64.h | 1 - arch/sparc/include/asm/irqflags_32.h | 1 + arch/sparc/include/asm/mmu_context_64.h | 1 - arch/sparc/include/asm/ns87303.h | 1 - arch/sparc/include/asm/perfctr.h | 23 +++ arch/sparc/include/asm/pgtable_32.h | 2 +- arch/sparc/include/asm/pgtable_64.h | 1 - arch/sparc/include/asm/processor.h | 3 + arch/sparc/include/asm/processor_64.h | 3 + arch/sparc/include/asm/ptrace.h | 5 +- arch/sparc/include/asm/setup.h | 16 ++ arch/sparc/include/asm/switch_to.h | 8 + arch/sparc/include/asm/switch_to_32.h | 106 ++++++++++ arch/sparc/include/asm/switch_to_64.h | 72 +++++++ arch/sparc/include/asm/system.h | 14 +- arch/sparc/include/asm/system_32.h | 284 --------------------------- arch/sparc/include/asm/system_64.h | 331 -------------------------------- arch/sparc/include/asm/timer_32.h | 3 +- arch/sparc/include/asm/uaccess_64.h | 1 - arch/sparc/kernel/auxio_32.c | 1 + arch/sparc/kernel/devices.c | 2 +- arch/sparc/kernel/irq.h | 1 + arch/sparc/kernel/irq_64.c | 1 - arch/sparc/kernel/kgdb_32.c | 1 + arch/sparc/kernel/module.c | 1 + arch/sparc/kernel/muldiv.c | 1 - arch/sparc/kernel/nmi.c | 1 + arch/sparc/kernel/pcr.c | 1 + arch/sparc/kernel/perf_event.c | 2 + arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 1 - arch/sparc/kernel/ptrace_32.c | 2 +- arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/reboot.c | 2 +- arch/sparc/kernel/setup_32.c | 2 +- arch/sparc/kernel/setup_64.c | 2 +- arch/sparc/kernel/signal32.c | 1 + arch/sparc/kernel/signal_32.c | 1 + arch/sparc/kernel/signal_64.c | 2 + arch/sparc/kernel/sigutil_32.c | 1 + arch/sparc/kernel/sigutil_64.c | 1 + arch/sparc/kernel/sparc_ksyms_64.c | 2 +- arch/sparc/kernel/time_32.c | 1 - arch/sparc/kernel/traps_32.c | 1 - arch/sparc/kernel/traps_64.c | 2 +- arch/sparc/kernel/unaligned_32.c | 1 - arch/sparc/kernel/unaligned_64.c | 2 +- arch/sparc/kernel/visemul.c | 2 +- arch/sparc/math-emu/math_64.c | 1 + arch/sparc/mm/btfixup.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 1 - arch/sparc/mm/loadmmu.c | 1 - arch/sparc/mm/tsb.c | 1 - arch/sparc/prom/console_32.c | 1 - arch/sparc/prom/console_64.c | 1 - arch/sparc/prom/misc_32.c | 1 - arch/sparc/prom/misc_64.c | 1 - arch/sparc/prom/p1275.c | 1 - arch/sparc/prom/ranges.c | 1 - drivers/tty/serial/sunhv.c | 1 + drivers/tty/serial/sunsab.c | 1 + drivers/tty/serial/sunsu.c | 1 + drivers/tty/serial/sunzilog.c | 1 + kernel/signal.c | 1 + kernel/sysctl.c | 3 + 84 files changed, 684 insertions(+), 669 deletions(-) create mode 100644 arch/sparc/include/asm/barrier.h create mode 100644 arch/sparc/include/asm/barrier_32.h create mode 100644 arch/sparc/include/asm/barrier_64.h create mode 100644 arch/sparc/include/asm/cmpxchg.h create mode 100644 arch/sparc/include/asm/cmpxchg_32.h create mode 100644 arch/sparc/include/asm/cmpxchg_64.h create mode 100644 arch/sparc/include/asm/cpu_type.h create mode 100644 arch/sparc/include/asm/exec.h create mode 100644 arch/sparc/include/asm/switch_to.h create mode 100644 arch/sparc/include/asm/switch_to_32.h create mode 100644 arch/sparc/include/asm/switch_to_64.h delete mode 100644 arch/sparc/include/asm/system_32.h delete mode 100644 arch/sparc/include/asm/system_64.h (limited to 'kernel') diff --git a/arch/sparc/include/asm/atomic_32.h b/arch/sparc/include/asm/atomic_32.h index 9dd0a769fa18..905832aa9e9e 100644 --- a/arch/sparc/include/asm/atomic_32.h +++ b/arch/sparc/include/asm/atomic_32.h @@ -13,9 +13,9 @@ #include +#include #include -#include #define ATOMIC_INIT(i) { (i) } diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index 9f421df46aec..ce35a1cf1a20 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -8,7 +8,7 @@ #define __ARCH_SPARC64_ATOMIC__ #include -#include +#include #define ATOMIC_INIT(i) { (i) } #define ATOMIC64_INIT(i) { (i) } @@ -85,7 +85,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) return c; } - #define atomic64_cmpxchg(v, o, n) \ ((__typeof__((v)->counter))cmpxchg(&((v)->counter), (o), (n))) #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) diff --git a/arch/sparc/include/asm/auxio_32.h b/arch/sparc/include/asm/auxio_32.h index e03e088be95f..3a319775ae37 100644 --- a/arch/sparc/include/asm/auxio_32.h +++ b/arch/sparc/include/asm/auxio_32.h @@ -6,7 +6,6 @@ #ifndef _SPARC_AUXIO_H #define _SPARC_AUXIO_H -#include #include /* This register is an unsigned char in IO space. It does two things. diff --git a/arch/sparc/include/asm/barrier.h b/arch/sparc/include/asm/barrier.h new file mode 100644 index 000000000000..b25f02a029e0 --- /dev/null +++ b/arch/sparc/include/asm/barrier.h @@ -0,0 +1,8 @@ +#ifndef ___ASM_SPARC_BARRIER_H +#define ___ASM_SPARC_BARRIER_H +#if defined(__sparc__) && defined(__arch64__) +#include +#else +#include +#endif +#endif diff --git a/arch/sparc/include/asm/barrier_32.h b/arch/sparc/include/asm/barrier_32.h new file mode 100644 index 000000000000..c1b76654ee76 --- /dev/null +++ b/arch/sparc/include/asm/barrier_32.h @@ -0,0 +1,15 @@ +#ifndef __SPARC_BARRIER_H +#define __SPARC_BARRIER_H + +/* XXX Change this if we ever use a PSO mode kernel. */ +#define mb() __asm__ __volatile__ ("" : : : "memory") +#define rmb() mb() +#define wmb() mb() +#define read_barrier_depends() do { } while(0) +#define set_mb(__var, __value) do { __var = __value; mb(); } while(0) +#define smp_mb() __asm__ __volatile__("":::"memory") +#define smp_rmb() __asm__ __volatile__("":::"memory") +#define smp_wmb() __asm__ __volatile__("":::"memory") +#define smp_read_barrier_depends() do { } while(0) + +#endif /* !(__SPARC_BARRIER_H) */ diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h new file mode 100644 index 000000000000..95d45986f908 --- /dev/null +++ b/arch/sparc/include/asm/barrier_64.h @@ -0,0 +1,56 @@ +#ifndef __SPARC64_BARRIER_H +#define __SPARC64_BARRIER_H + +/* These are here in an effort to more fully work around Spitfire Errata + * #51. Essentially, if a memory barrier occurs soon after a mispredicted + * branch, the chip can stop executing instructions until a trap occurs. + * Therefore, if interrupts are disabled, the chip can hang forever. + * + * It used to be believed that the memory barrier had to be right in the + * delay slot, but a case has been traced recently wherein the memory barrier + * was one instruction after the branch delay slot and the chip still hung. + * The offending sequence was the following in sym_wakeup_done() of the + * sym53c8xx_2 driver: + * + * call sym_ccb_from_dsa, 0 + * movge %icc, 0, %l0 + * brz,pn %o0, .LL1303 + * mov %o0, %l2 + * membar #LoadLoad + * + * The branch has to be mispredicted for the bug to occur. Therefore, we put + * the memory barrier explicitly into a "branch always, predicted taken" + * delay slot to avoid the problem case. + */ +#define membar_safe(type) \ +do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ + " membar " type "\n" \ + "1:\n" \ + : : : "memory"); \ +} while (0) + +/* The kernel always executes in TSO memory model these days, + * and furthermore most sparc64 chips implement more stringent + * memory ordering than required by the specifications. + */ +#define mb() membar_safe("#StoreLoad") +#define rmb() __asm__ __volatile__("":::"memory") +#define wmb() __asm__ __volatile__("":::"memory") + +#define read_barrier_depends() do { } while(0) +#define set_mb(__var, __value) \ + do { __var = __value; membar_safe("#StoreLoad"); } while(0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#define smp_rmb() rmb() +#define smp_wmb() wmb() +#else +#define smp_mb() __asm__ __volatile__("":::"memory") +#define smp_rmb() __asm__ __volatile__("":::"memory") +#define smp_wmb() __asm__ __volatile__("":::"memory") +#endif + +#define smp_read_barrier_depends() do { } while(0) + +#endif /* !(__SPARC64_BARRIER_H) */ diff --git a/arch/sparc/include/asm/bug.h b/arch/sparc/include/asm/bug.h index 8a59e5a8c217..6bd9f43cb5a5 100644 --- a/arch/sparc/include/asm/bug.h +++ b/arch/sparc/include/asm/bug.h @@ -19,4 +19,7 @@ extern void do_BUG(const char *file, int line); #include +struct pt_regs; +extern void die_if_kernel(char *str, struct pt_regs *regs) __attribute__ ((noreturn)); + #endif diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index 2e468773f250..68431b47a22a 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -83,4 +83,13 @@ extern void sparc_flush_page_to_ram(struct page *page); #define flush_cache_vmap(start, end) flush_cache_all() #define flush_cache_vunmap(start, end) flush_cache_all() +/* When a context switch happens we must flush all user windows so that + * the windows of the current process are flushed onto its stack. This + * way the windows are all clean for the next process and the stack + * frames are up to date. + */ +extern void flush_user_windows(void); +extern void kill_user_windows(void); +extern void flushw_all(void); + #endif /* _SPARC_CACHEFLUSH_H */ diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index b95384033e89..2efea2ff88b7 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -9,6 +9,16 @@ /* Cache flush operations. */ + +#define flushi(addr) __asm__ __volatile__ ("flush %0" : : "r" (addr) : "memory") +#define flushw_all() __asm__ __volatile__("flushw") + +extern void __flushw_user(void); +#define flushw_user() __flushw_user() + +#define flush_user_windows flushw_user +#define flush_register_windows flushw_all + /* These are the same regardless of whether this is an SMP kernel or not. */ #define flush_cache_mm(__mm) \ do { if ((__mm) == current->mm) flushw_user(); } while(0) diff --git a/arch/sparc/include/asm/cmpxchg.h b/arch/sparc/include/asm/cmpxchg.h new file mode 100644 index 000000000000..9355893efa52 --- /dev/null +++ b/arch/sparc/include/asm/cmpxchg.h @@ -0,0 +1,8 @@ +#ifndef ___ASM_SPARC_CMPXCHG_H +#define ___ASM_SPARC_CMPXCHG_H +#if defined(__sparc__) && defined(__arch64__) +#include +#else +#include +#endif +#endif diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h new file mode 100644 index 000000000000..c786b0a92b51 --- /dev/null +++ b/arch/sparc/include/asm/cmpxchg_32.h @@ -0,0 +1,112 @@ +/* 32-bit atomic xchg() and cmpxchg() definitions. + * + * Copyright (C) 1996 David S. Miller (davem@davemloft.net) + * Copyright (C) 2000 Anton Blanchard (anton@linuxcare.com.au) + * Copyright (C) 2007 Kyle McMartin (kyle@parisc-linux.org) + * + * Additions by Keith M Wesolowski (wesolows@foobazco.org) based + * on asm-parisc/atomic.h Copyright (C) 2000 Philipp Rumpf . + */ + +#ifndef __ARCH_SPARC_CMPXCHG__ +#define __ARCH_SPARC_CMPXCHG__ + +#include + +/* This has special calling conventions */ +#ifndef CONFIG_SMP +BTFIXUPDEF_CALL(void, ___xchg32, void) +#endif + +static inline unsigned long xchg_u32(__volatile__ unsigned long *m, unsigned long val) +{ +#ifdef CONFIG_SMP + __asm__ __volatile__("swap [%2], %0" + : "=&r" (val) + : "0" (val), "r" (m) + : "memory"); + return val; +#else + register unsigned long *ptr asm("g1"); + register unsigned long ret asm("g2"); + + ptr = (unsigned long *) m; + ret = val; + + /* Note: this is magic and the nop there is + really needed. */ + __asm__ __volatile__( + "mov %%o7, %%g4\n\t" + "call ___f____xchg32\n\t" + " nop\n\t" + : "=&r" (ret) + : "0" (ret), "r" (ptr) + : "g3", "g4", "g7", "memory", "cc"); + + return ret; +#endif +} + +extern void __xchg_called_with_bad_pointer(void); + +static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int size) +{ + switch (size) { + case 4: + return xchg_u32(ptr, x); + } + __xchg_called_with_bad_pointer(); + return x; +} + +#define xchg(ptr,x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) + +/* Emulate cmpxchg() the same way we emulate atomics, + * by hashing the object address and indexing into an array + * of spinlocks to get a bit of performance... + * + * See arch/sparc/lib/atomic32.c for implementation. + * + * Cribbed from + */ +#define __HAVE_ARCH_CMPXCHG 1 + +/* bug catcher for when unsupported size is used - won't link */ +extern void __cmpxchg_called_with_bad_pointer(void); +/* we only need to support cmpxchg of a u32 on sparc */ +extern unsigned long __cmpxchg_u32(volatile u32 *m, u32 old, u32 new_); + +/* don't worry...optimizer will get rid of most of this */ +static inline unsigned long +__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size) +{ + switch (size) { + case 4: + return __cmpxchg_u32((u32 *)ptr, (u32)old, (u32)new_); + default: + __cmpxchg_called_with_bad_pointer(); + break; + } + return old; +} + +#define cmpxchg(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ +}) + +#include + +/* + * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make + * them available. + */ +#define cmpxchg_local(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr), (unsigned long)(o),\ + (unsigned long)(n), sizeof(*(ptr)))) +#define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) + +#endif /* __ARCH_SPARC_CMPXCHG__ */ diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h new file mode 100644 index 000000000000..b30eb37294c5 --- /dev/null +++ b/arch/sparc/include/asm/cmpxchg_64.h @@ -0,0 +1,145 @@ +/* 64-bit atomic xchg() and cmpxchg() definitions. + * + * Copyright (C) 1996, 1997, 2000 David S. Miller (davem@redhat.com) + */ + +#ifndef __ARCH_SPARC64_CMPXCHG__ +#define __ARCH_SPARC64_CMPXCHG__ + +static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) +{ + unsigned long tmp1, tmp2; + + __asm__ __volatile__( +" mov %0, %1\n" +"1: lduw [%4], %2\n" +" cas [%4], %2, %0\n" +" cmp %2, %0\n" +" bne,a,pn %%icc, 1b\n" +" mov %1, %0\n" + : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2) + : "0" (val), "r" (m) + : "cc", "memory"); + return val; +} + +static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long val) +{ + unsigned long tmp1, tmp2; + + __asm__ __volatile__( +" mov %0, %1\n" +"1: ldx [%4], %2\n" +" casx [%4], %2, %0\n" +" cmp %2, %0\n" +" bne,a,pn %%xcc, 1b\n" +" mov %1, %0\n" + : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2) + : "0" (val), "r" (m) + : "cc", "memory"); + return val; +} + +#define xchg(ptr,x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) + +extern void __xchg_called_with_bad_pointer(void); + +static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, + int size) +{ + switch (size) { + case 4: + return xchg32(ptr, x); + case 8: + return xchg64(ptr, x); + } + __xchg_called_with_bad_pointer(); + return x; +} + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + */ + +#include + +#define __HAVE_ARCH_CMPXCHG 1 + +static inline unsigned long +__cmpxchg_u32(volatile int *m, int old, int new) +{ + __asm__ __volatile__("cas [%2], %3, %0" + : "=&r" (new) + : "0" (new), "r" (m), "r" (old) + : "memory"); + + return new; +} + +static inline unsigned long +__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new) +{ + __asm__ __volatile__("casx [%2], %3, %0" + : "=&r" (new) + : "0" (new), "r" (m), "r" (old) + : "memory"); + + return new; +} + +/* This function doesn't exist, so you'll get a linker error + if something tries to do an invalid cmpxchg(). */ +extern void __cmpxchg_called_with_bad_pointer(void); + +static inline unsigned long +__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) +{ + switch (size) { + case 4: + return __cmpxchg_u32(ptr, old, new); + case 8: + return __cmpxchg_u64(ptr, old, new); + } + __cmpxchg_called_with_bad_pointer(); + return old; +} + +#define cmpxchg(ptr,o,n) \ + ({ \ + __typeof__(*(ptr)) _o_ = (o); \ + __typeof__(*(ptr)) _n_ = (n); \ + (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ + (unsigned long)_n_, sizeof(*(ptr))); \ + }) + +/* + * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make + * them available. + */ + +static inline unsigned long __cmpxchg_local(volatile void *ptr, + unsigned long old, + unsigned long new, int size) +{ + switch (size) { + case 4: + case 8: return __cmpxchg(ptr, old, new, size); + default: + return __cmpxchg_local_generic(ptr, old, new, size); + } + + return old; +} + +#define cmpxchg_local(ptr, o, n) \ + ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ + (unsigned long)(n), sizeof(*(ptr)))) +#define cmpxchg64_local(ptr, o, n) \ + ({ \ + BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ + cmpxchg_local((ptr), (o), (n)); \ + }) + +#endif /* __ARCH_SPARC64_CMPXCHG__ */ diff --git a/arch/sparc/include/asm/cpu_type.h b/arch/sparc/include/asm/cpu_type.h new file mode 100644 index 000000000000..4ca184d95d82 --- /dev/null +++ b/arch/sparc/include/asm/cpu_type.h @@ -0,0 +1,34 @@ +#ifndef __ASM_CPU_TYPE_H +#define __ASM_CPU_TYPE_H + +/* + * Sparc (general) CPU types + */ +enum sparc_cpu { + sun4 = 0x00, + sun4c = 0x01, + sun4m = 0x02, + sun4d = 0x03, + sun4e = 0x04, + sun4u = 0x05, /* V8 ploos ploos */ + sun_unknown = 0x06, + ap1000 = 0x07, /* almost a sun4m */ + sparc_leon = 0x08, /* Leon SoC */ +}; + +#ifdef CONFIG_SPARC32 +extern enum sparc_cpu sparc_cpu_model; + +#define ARCH_SUN4C (sparc_cpu_model==sun4c) + +#define SUN4M_NCPUS 4 /* Architectural limit of sun4m. */ + +#else + +#define sparc_cpu_model sun4u + +/* This cannot ever be a sun4c :) That's just history. */ +#define ARCH_SUN4C 0 +#endif + +#endif /* __ASM_CPU_TYPE_H */ diff --git a/arch/sparc/include/asm/exec.h b/arch/sparc/include/asm/exec.h new file mode 100644 index 000000000000..2e085881e0d1 --- /dev/null +++ b/arch/sparc/include/asm/exec.h @@ -0,0 +1,6 @@ +#ifndef __SPARC_EXEC_H +#define __SPARC_EXEC_H + +#define arch_align_stack(x) (x) + +#endif /* __SPARC_EXEC_H */ diff --git a/arch/sparc/include/asm/floppy_32.h b/arch/sparc/include/asm/floppy_32.h index 7440915e86d8..698d9559fead 100644 --- a/arch/sparc/include/asm/floppy_32.h +++ b/arch/sparc/include/asm/floppy_32.h @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/include/asm/futex_64.h b/arch/sparc/include/asm/futex_64.h index 444e7bea23bc..4e899b0dabf7 100644 --- a/arch/sparc/include/asm/futex_64.h +++ b/arch/sparc/include/asm/futex_64.h @@ -4,7 +4,6 @@ #include #include #include -#include #define __futex_cas_op(insn, ret, oldval, uaddr, oparg) \ __asm__ __volatile__( \ diff --git a/arch/sparc/include/asm/io_32.h b/arch/sparc/include/asm/io_32.h index 2006e5d359df..c1acbd891cbc 100644 --- a/arch/sparc/include/asm/io_32.h +++ b/arch/sparc/include/asm/io_32.h @@ -6,7 +6,6 @@ #include /* struct resource */ #include /* IO address mapping routines need this */ -#include #include #define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT) diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h index 9481e5a6fa90..09b0b88aeb2a 100644 --- a/arch/sparc/include/asm/io_64.h +++ b/arch/sparc/include/asm/io_64.h @@ -6,7 +6,6 @@ #include #include /* IO address mapping routines need this */ -#include #include #include diff --git a/arch/sparc/include/asm/irqflags_32.h b/arch/sparc/include/asm/irqflags_32.h index 14848909e0de..e414c06615c1 100644 --- a/arch/sparc/include/asm/irqflags_32.h +++ b/arch/sparc/include/asm/irqflags_32.h @@ -13,6 +13,7 @@ #ifndef __ASSEMBLY__ #include +#include extern void arch_local_irq_restore(unsigned long); extern unsigned long arch_local_irq_save(void); diff --git a/arch/sparc/include/asm/mmu_context_64.h b/arch/sparc/include/asm/mmu_context_64.h index 666a73fef28d..a97fd085cebe 100644 --- a/arch/sparc/include/asm/mmu_context_64.h +++ b/arch/sparc/include/asm/mmu_context_64.h @@ -6,7 +6,6 @@ #ifndef __ASSEMBLY__ #include -#include #include #include diff --git a/arch/sparc/include/asm/ns87303.h b/arch/sparc/include/asm/ns87303.h index af755483e17d..6b947ee0f6aa 100644 --- a/arch/sparc/include/asm/ns87303.h +++ b/arch/sparc/include/asm/ns87303.h @@ -79,7 +79,6 @@ #include -#include #include extern spinlock_t ns87303_lock; diff --git a/arch/sparc/include/asm/perfctr.h b/arch/sparc/include/asm/perfctr.h index 8d8720a8770d..3332d2cba6c1 100644 --- a/arch/sparc/include/asm/perfctr.h +++ b/arch/sparc/include/asm/perfctr.h @@ -168,6 +168,29 @@ struct vcounter_struct { unsigned long long vcnt1; }; +#else /* !(__KERNEL__) */ + +#ifndef CONFIG_SPARC32 + +/* Performance counter register access. */ +#define read_pcr(__p) __asm__ __volatile__("rd %%pcr, %0" : "=r" (__p)) +#define write_pcr(__p) __asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (__p)) +#define read_pic(__p) __asm__ __volatile__("rd %%pic, %0" : "=r" (__p)) + +/* Blackbird errata workaround. See commentary in + * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt() + * for more information. + */ +#define write_pic(__p) \ + __asm__ __volatile__("ba,pt %%xcc, 99f\n\t" \ + " nop\n\t" \ + ".align 64\n" \ + "99:wr %0, 0x0, %%pic\n\t" \ + "rd %%pic, %%g0" : : "r" (__p)) +#define reset_pic() write_pic(0) + +#endif /* !CONFIG_SPARC32 */ + #endif /* !(__KERNEL__) */ #endif /* !(PERF_COUNTER_API) */ diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index a790cc657476..3d7101860e68 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include struct vm_area_struct; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 38ebb2c60137..6fa2f7980e6b 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/arch/sparc/include/asm/processor.h b/arch/sparc/include/asm/processor.h index 9da9646bf6c6..2fe99e66e760 100644 --- a/arch/sparc/include/asm/processor.h +++ b/arch/sparc/include/asm/processor.h @@ -5,4 +5,7 @@ #else #include #endif + +#define nop() __asm__ __volatile__ ("nop") + #endif diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index 59fcebb8f440..e713db249931 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -18,6 +18,9 @@ #include #include +/* Don't hold the runqueue lock over context switch */ +#define __ARCH_WANT_UNLOCKED_CTXSW + /* The sparc has no problems with write protection */ #define wp_works_ok 1 #define wp_works_ok__is_a_macro /* for versions in ksyms.c */ diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index c00c3b5c2806..ef8c7c068f53 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -98,6 +98,8 @@ struct sparc_trapf { */ #ifndef __ASSEMBLY__ +#include + struct pt_regs { unsigned long psr; unsigned long pc; @@ -163,7 +165,6 @@ struct sparc_stackf { #ifdef __KERNEL__ #include -#include static inline int pt_regs_trap_type(struct pt_regs *regs) { @@ -240,8 +241,6 @@ extern unsigned long profile_pc(struct pt_regs *); #ifdef __KERNEL__ -#include - static inline bool pt_regs_is_syscall(struct pt_regs *regs) { return (regs->psr & PSR_SYSCALL); diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index 64718ba26434..00497abec996 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -13,14 +13,30 @@ #ifdef __KERNEL__ +extern char reboot_command[]; + #ifdef CONFIG_SPARC32 /* The CPU that was used for booting * Only sun4d + leon may have boot_cpu_id != 0 */ extern unsigned char boot_cpu_id; extern unsigned char boot_cpu_id4; + +extern unsigned long empty_bad_page; +extern unsigned long empty_bad_page_table; +extern unsigned long empty_zero_page; + +extern int serial_console; +static inline int con_is_present(void) +{ + return serial_console ? 0 : 1; +} #endif +extern void sun_do_break(void); +extern int stop_a_enabled; +extern int scons_pwroff; + #endif /* __KERNEL__ */ #endif /* _SPARC_SETUP_H */ diff --git a/arch/sparc/include/asm/switch_to.h b/arch/sparc/include/asm/switch_to.h new file mode 100644 index 000000000000..2dc4fa5c6f8c --- /dev/null +++ b/arch/sparc/include/asm/switch_to.h @@ -0,0 +1,8 @@ +#ifndef ___ASM_SPARC_SWITCH_TO_H +#define ___ASM_SPARC_SWITCH_TO_H +#if defined(__sparc__) && defined(__arch64__) +#include +#else +#include +#endif +#endif diff --git a/arch/sparc/include/asm/switch_to_32.h b/arch/sparc/include/asm/switch_to_32.h new file mode 100644 index 000000000000..e32e82b76eed --- /dev/null +++ b/arch/sparc/include/asm/switch_to_32.h @@ -0,0 +1,106 @@ +#ifndef __SPARC_SWITCH_TO_H +#define __SPARC_SWITCH_TO_H + +#include + +extern struct thread_info *current_set[NR_CPUS]; + +/* + * Flush windows so that the VM switch which follows + * would not pull the stack from under us. + * + * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work) + * XXX WTF is the above comment? Found in late teen 2.4.x. + */ +#ifdef CONFIG_SMP +#define SWITCH_ENTER(prv) \ + do { \ + if (test_tsk_thread_flag(prv, TIF_USEDFPU)) { \ + put_psr(get_psr() | PSR_EF); \ + fpsave(&(prv)->thread.float_regs[0], &(prv)->thread.fsr, \ + &(prv)->thread.fpqueue[0], &(prv)->thread.fpqdepth); \ + clear_tsk_thread_flag(prv, TIF_USEDFPU); \ + (prv)->thread.kregs->psr &= ~PSR_EF; \ + } \ + } while(0) + +#define SWITCH_DO_LAZY_FPU(next) /* */ +#else +#define SWITCH_ENTER(prv) /* */ +#define SWITCH_DO_LAZY_FPU(nxt) \ + do { \ + if (last_task_used_math != (nxt)) \ + (nxt)->thread.kregs->psr&=~PSR_EF; \ + } while(0) +#endif + +#define prepare_arch_switch(next) do { \ + __asm__ __volatile__( \ + ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \ + "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \ + "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \ + "save %sp, -0x40, %sp\n\t" \ + "restore; restore; restore; restore; restore; restore; restore"); \ +} while(0) + + /* Much care has gone into this code, do not touch it. + * + * We need to loadup regs l0/l1 for the newly forked child + * case because the trap return path relies on those registers + * holding certain values, gcc is told that they are clobbered. + * Gcc needs registers for 3 values in and 1 value out, so we + * clobber every non-fixed-usage register besides l2/l3/o4/o5. -DaveM + * + * Hey Dave, that do not touch sign is too much of an incentive + * - Anton & Pete + */ +#define switch_to(prev, next, last) do { \ + SWITCH_ENTER(prev); \ + SWITCH_DO_LAZY_FPU(next); \ + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next->active_mm)); \ + __asm__ __volatile__( \ + "sethi %%hi(here - 0x8), %%o7\n\t" \ + "mov %%g6, %%g3\n\t" \ + "or %%o7, %%lo(here - 0x8), %%o7\n\t" \ + "rd %%psr, %%g4\n\t" \ + "std %%sp, [%%g6 + %4]\n\t" \ + "rd %%wim, %%g5\n\t" \ + "wr %%g4, 0x20, %%psr\n\t" \ + "nop\n\t" \ + "std %%g4, [%%g6 + %3]\n\t" \ + "ldd [%2 + %3], %%g4\n\t" \ + "mov %2, %%g6\n\t" \ + ".globl patchme_store_new_current\n" \ +"patchme_store_new_current:\n\t" \ + "st %2, [%1]\n\t" \ + "wr %%g4, 0x20, %%psr\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "nop\n\t" /* LEON needs all 3 nops: load to %sp depends on CWP. */ \ + "ldd [%%g6 + %4], %%sp\n\t" \ + "wr %%g5, 0x0, %%wim\n\t" \ + "ldd [%%sp + 0x00], %%l0\n\t" \ + "ldd [%%sp + 0x38], %%i6\n\t" \ + "wr %%g4, 0x0, %%psr\n\t" \ + "nop\n\t" \ + "nop\n\t" \ + "jmpl %%o7 + 0x8, %%g0\n\t" \ + " ld [%%g3 + %5], %0\n\t" \ + "here:\n" \ + : "=&r" (last) \ + : "r" (&(current_set[hard_smp_processor_id()])), \ + "r" (task_thread_info(next)), \ + "i" (TI_KPSR), \ + "i" (TI_KSP), \ + "i" (TI_TASK) \ + : "g1", "g2", "g3", "g4", "g5", "g7", \ + "l0", "l1", "l3", "l4", "l5", "l6", "l7", \ + "i0", "i1", "i2", "i3", "i4", "i5", \ + "o0", "o1", "o2", "o3", "o7"); \ + } while(0) + +extern void fpsave(unsigned long *fpregs, unsigned long *fsr, + void *fpqueue, unsigned long *fpqdepth); +extern void synchronize_user_stack(void); + +#endif /* __SPARC_SWITCH_TO_H */ diff --git a/arch/sparc/include/asm/switch_to_64.h b/arch/sparc/include/asm/switch_to_64.h new file mode 100644 index 000000000000..7923c4a2be38 --- /dev/null +++ b/arch/sparc/include/asm/switch_to_64.h @@ -0,0 +1,72 @@ +#ifndef __SPARC64_SWITCH_TO_64_H +#define __SPARC64_SWITCH_TO_64_H + +#include + +#define prepare_arch_switch(next) \ +do { \ + flushw_all(); \ +} while (0) + + /* See what happens when you design the chip correctly? + * + * We tell gcc we clobber all non-fixed-usage registers except + * for l0/l1. It will use one for 'next' and the other to hold + * the output value of 'last'. 'next' is not referenced again + * past the invocation of switch_to in the scheduler, so we need + * not preserve it's value. Hairy, but it lets us remove 2 loads + * and 2 stores in this critical code path. -DaveM + */ +#define switch_to(prev, next, last) \ +do { flush_tlb_pending(); \ + save_and_clear_fpu(); \ + /* If you are tempted to conditionalize the following */ \ + /* so that ASI is only written if it changes, think again. */ \ + __asm__ __volatile__("wr %%g0, %0, %%asi" \ + : : "r" (__thread_flag_byte_ptr(task_thread_info(next))[TI_FLAG_BYTE_CURRENT_DS]));\ + trap_block[current_thread_info()->cpu].thread = \ + task_thread_info(next); \ + __asm__ __volatile__( \ + "mov %%g4, %%g7\n\t" \ + "stx %%i6, [%%sp + 2047 + 0x70]\n\t" \ + "stx %%i7, [%%sp + 2047 + 0x78]\n\t" \ + "rdpr %%wstate, %%o5\n\t" \ + "stx %%o6, [%%g6 + %6]\n\t" \ + "stb %%o5, [%%g6 + %5]\n\t" \ + "rdpr %%cwp, %%o5\n\t" \ + "stb %%o5, [%%g6 + %8]\n\t" \ + "wrpr %%g0, 15, %%pil\n\t" \ + "mov %4, %%g6\n\t" \ + "ldub [%4 + %8], %%g1\n\t" \ + "wrpr %%g1, %%cwp\n\t" \ + "ldx [%%g6 + %6], %%o6\n\t" \ + "ldub [%%g6 + %5], %%o5\n\t" \ + "ldub [%%g6 + %7], %%o7\n\t" \ + "wrpr %%o5, 0x0, %%wstate\n\t" \ + "ldx [%%sp + 2047 + 0x70], %%i6\n\t" \ + "ldx [%%sp + 2047 + 0x78], %%i7\n\t" \ + "ldx [%%g6 + %9], %%g4\n\t" \ + "wrpr %%g0, 14, %%pil\n\t" \ + "brz,pt %%o7, switch_to_pc\n\t" \ + " mov %%g7, %0\n\t" \ + "sethi %%hi(ret_from_syscall), %%g1\n\t" \ + "jmpl %%g1 + %%lo(ret_from_syscall), %%g0\n\t" \ + " nop\n\t" \ + ".globl switch_to_pc\n\t" \ + "switch_to_pc:\n\t" \ + : "=&r" (last), "=r" (current), "=r" (current_thread_info_reg), \ + "=r" (__local_per_cpu_offset) \ + : "0" (task_thread_info(next)), \ + "i" (TI_WSTATE), "i" (TI_KSP), "i" (TI_NEW_CHILD), \ + "i" (TI_CWP), "i" (TI_TASK) \ + : "cc", \ + "g1", "g2", "g3", "g7", \ + "l1", "l2", "l3", "l4", "l5", "l6", "l7", \ + "i0", "i1", "i2", "i3", "i4", "i5", \ + "o0", "o1", "o2", "o3", "o4", "o5", "o7"); \ +} while(0) + +extern void synchronize_user_stack(void); +extern void fault_in_user_windows(void); + +#endif /* __SPARC64_SWITCH_TO_64_H */ diff --git a/arch/sparc/include/asm/system.h b/arch/sparc/include/asm/system.h index 7944a7cfc996..ed532ba000b1 100644 --- a/arch/sparc/include/asm/system.h +++ b/arch/sparc/include/asm/system.h @@ -1,8 +1,6 @@ -#ifndef ___ASM_SPARC_SYSTEM_H -#define ___ASM_SPARC_SYSTEM_H -#if defined(__sparc__) && defined(__arch64__) -#include -#else -#include -#endif -#endif +/* FILE TO BE DELETED. DO NOT ADD STUFF HERE! */ +#include +#include +#include +#include +#include diff --git a/arch/sparc/include/asm/system_32.h b/arch/sparc/include/asm/system_32.h deleted file mode 100644 index aba16092a81b..000000000000 --- a/arch/sparc/include/asm/system_32.h +++ /dev/null @@ -1,284 +0,0 @@ -#ifndef __SPARC_SYSTEM_H -#define __SPARC_SYSTEM_H - -#include -#include /* NR_CPUS */ -#include - -#include -#include -#include -#include -#include - -#ifndef __ASSEMBLY__ - -#include - -/* - * Sparc (general) CPU types - */ -enum sparc_cpu { - sun4 = 0x00, - sun4c = 0x01, - sun4m = 0x02, - sun4d = 0x03, - sun4e = 0x04, - sun4u = 0x05, /* V8 ploos ploos */ - sun_unknown = 0x06, - ap1000 = 0x07, /* almost a sun4m */ - sparc_leon = 0x08, /* Leon SoC */ -}; - -/* Really, userland should not be looking at any of this... */ -#ifdef __KERNEL__ - -extern enum sparc_cpu sparc_cpu_model; - -#define ARCH_SUN4C (sparc_cpu_model==sun4c) - -#define SUN4M_NCPUS 4 /* Architectural limit of sun4m. */ - -extern char reboot_command[]; - -extern struct thread_info *current_set[NR_CPUS]; - -extern unsigned long empty_bad_page; -extern unsigned long empty_bad_page_table; -extern unsigned long empty_zero_page; - -extern void sun_do_break(void); -extern int serial_console; -extern int stop_a_enabled; -extern int scons_pwroff; - -static inline int con_is_present(void) -{ - return serial_console ? 0 : 1; -} - -/* When a context switch happens we must flush all user windows so that - * the windows of the current process are flushed onto its stack. This - * way the windows are all clean for the next process and the stack - * frames are up to date. - */ -extern void flush_user_windows(void); -extern void kill_user_windows(void); -extern void synchronize_user_stack(void); -extern void fpsave(unsigned long *fpregs, unsigned long *fsr, - void *fpqueue, unsigned long *fpqdepth); - -#ifdef CONFIG_SMP -#define SWITCH_ENTER(prv) \ - do { \ - if (test_tsk_thread_flag(prv, TIF_USEDFPU)) { \ - put_psr(get_psr() | PSR_EF); \ - fpsave(&(prv)->thread.float_regs[0], &(prv)->thread.fsr, \ - &(prv)->thread.fpqueue[0], &(prv)->thread.fpqdepth); \ - clear_tsk_thread_flag(prv, TIF_USEDFPU); \ - (prv)->thread.kregs->psr &= ~PSR_EF; \ - } \ - } while(0) - -#define SWITCH_DO_LAZY_FPU(next) /* */ -#else -#define SWITCH_ENTER(prv) /* */ -#define SWITCH_DO_LAZY_FPU(nxt) \ - do { \ - if (last_task_used_math != (nxt)) \ - (nxt)->thread.kregs->psr&=~PSR_EF; \ - } while(0) -#endif - -extern void flushw_all(void); - -/* - * Flush windows so that the VM switch which follows - * would not pull the stack from under us. - * - * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work) - * XXX WTF is the above comment? Found in late teen 2.4.x. - */ -#define prepare_arch_switch(next) do { \ - __asm__ __volatile__( \ - ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \ - "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \ - "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \ - "save %sp, -0x40, %sp\n\t" \ - "restore; restore; restore; restore; restore; restore; restore"); \ -} while(0) - - /* Much care has gone into this code, do not touch it. - * - * We need to loadup regs l0/l1 for the newly forked child - * case because the trap return path relies on those registers - * holding certain values, gcc is told that they are clobbered. - * Gcc needs registers for 3 values in and 1 value out, so we - * clobber every non-fixed-usage register besides l2/l3/o4/o5. -DaveM - * - * Hey Dave, that do not touch sign is too much of an incentive - * - Anton & Pete - */ -#define switch_to(prev, next, last) do { \ - SWITCH_ENTER(prev); \ - SWITCH_DO_LAZY_FPU(next); \ - cpumask_set_cpu(smp_processor_id(), mm_cpumask(next->active_mm)); \ - __asm__ __volatile__( \ - "sethi %%hi(here - 0x8), %%o7\n\t" \ - "mov %%g6, %%g3\n\t" \ - "or %%o7, %%lo(here - 0x8), %%o7\n\t" \ - "rd %%psr, %%g4\n\t" \ - "std %%sp, [%%g6 + %4]\n\t" \ - "rd %%wim, %%g5\n\t" \ - "wr %%g4, 0x20, %%psr\n\t" \ - "nop\n\t" \ - "std %%g4, [%%g6 + %3]\n\t" \ - "ldd [%2 + %3], %%g4\n\t" \ - "mov %2, %%g6\n\t" \ - ".globl patchme_store_new_current\n" \ -"patchme_store_new_current:\n\t" \ - "st %2, [%1]\n\t" \ - "wr %%g4, 0x20, %%psr\n\t" \ - "nop\n\t" \ - "nop\n\t" \ - "nop\n\t" /* LEON needs all 3 nops: load to %sp depends on CWP. */ \ - "ldd [%%g6 + %4], %%sp\n\t" \ - "wr %%g5, 0x0, %%wim\n\t" \ - "ldd [%%sp + 0x00], %%l0\n\t" \ - "ldd [%%sp + 0x38], %%i6\n\t" \ - "wr %%g4, 0x0, %%psr\n\t" \ - "nop\n\t" \ - "nop\n\t" \ - "jmpl %%o7 + 0x8, %%g0\n\t" \ - " ld [%%g3 + %5], %0\n\t" \ - "here:\n" \ - : "=&r" (last) \ - : "r" (&(current_set[hard_smp_processor_id()])), \ - "r" (task_thread_info(next)), \ - "i" (TI_KPSR), \ - "i" (TI_KSP), \ - "i" (TI_TASK) \ - : "g1", "g2", "g3", "g4", "g5", "g7", \ - "l0", "l1", "l3", "l4", "l5", "l6", "l7", \ - "i0", "i1", "i2", "i3", "i4", "i5", \ - "o0", "o1", "o2", "o3", "o7"); \ - } while(0) - -/* XXX Change this if we ever use a PSO mode kernel. */ -#define mb() __asm__ __volatile__ ("" : : : "memory") -#define rmb() mb() -#define wmb() mb() -#define read_barrier_depends() do { } while(0) -#define set_mb(__var, __value) do { __var = __value; mb(); } while(0) -#define smp_mb() __asm__ __volatile__("":::"memory") -#define smp_rmb() __asm__ __volatile__("":::"memory") -#define smp_wmb() __asm__ __volatile__("":::"memory") -#define smp_read_barrier_depends() do { } while(0) - -#define nop() __asm__ __volatile__ ("nop") - -/* This has special calling conventions */ -#ifndef CONFIG_SMP -BTFIXUPDEF_CALL(void, ___xchg32, void) -#endif - -static inline unsigned long xchg_u32(__volatile__ unsigned long *m, unsigned long val) -{ -#ifdef CONFIG_SMP - __asm__ __volatile__("swap [%2], %0" - : "=&r" (val) - : "0" (val), "r" (m) - : "memory"); - return val; -#else - register unsigned long *ptr asm("g1"); - register unsigned long ret asm("g2"); - - ptr = (unsigned long *) m; - ret = val; - - /* Note: this is magic and the nop there is - really needed. */ - __asm__ __volatile__( - "mov %%o7, %%g4\n\t" - "call ___f____xchg32\n\t" - " nop\n\t" - : "=&r" (ret) - : "0" (ret), "r" (ptr) - : "g3", "g4", "g7", "memory", "cc"); - - return ret; -#endif -} - -#define xchg(ptr,x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) - -extern void __xchg_called_with_bad_pointer(void); - -static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int size) -{ - switch (size) { - case 4: - return xchg_u32(ptr, x); - } - __xchg_called_with_bad_pointer(); - return x; -} - -/* Emulate cmpxchg() the same way we emulate atomics, - * by hashing the object address and indexing into an array - * of spinlocks to get a bit of performance... - * - * See arch/sparc/lib/atomic32.c for implementation. - * - * Cribbed from - */ -#define __HAVE_ARCH_CMPXCHG 1 - -/* bug catcher for when unsupported size is used - won't link */ -extern void __cmpxchg_called_with_bad_pointer(void); -/* we only need to support cmpxchg of a u32 on sparc */ -extern unsigned long __cmpxchg_u32(volatile u32 *m, u32 old, u32 new_); - -/* don't worry...optimizer will get rid of most of this */ -static inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new_, int size) -{ - switch (size) { - case 4: - return __cmpxchg_u32((u32 *)ptr, (u32)old, (u32)new_); - default: - __cmpxchg_called_with_bad_pointer(); - break; - } - return old; -} - -#define cmpxchg(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) _o_ = (o); \ - __typeof__(*(ptr)) _n_ = (n); \ - (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ - (unsigned long)_n_, sizeof(*(ptr))); \ -}) - -#include - -/* - * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make - * them available. - */ -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local_generic((ptr), (unsigned long)(o),\ - (unsigned long)(n), sizeof(*(ptr)))) -#define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n)) - -extern void die_if_kernel(char *str, struct pt_regs *regs) __attribute__ ((noreturn)); - -#endif /* __KERNEL__ */ - -#endif /* __ASSEMBLY__ */ - -#define arch_align_stack(x) (x) - -#endif /* !(__SPARC_SYSTEM_H) */ diff --git a/arch/sparc/include/asm/system_64.h b/arch/sparc/include/asm/system_64.h deleted file mode 100644 index 10bcabce97b2..000000000000 --- a/arch/sparc/include/asm/system_64.h +++ /dev/null @@ -1,331 +0,0 @@ -#ifndef __SPARC64_SYSTEM_H -#define __SPARC64_SYSTEM_H - -#include -#include -#include - -#ifndef __ASSEMBLY__ - -#include -#include - -/* - * Sparc (general) CPU types - */ -enum sparc_cpu { - sun4 = 0x00, - sun4c = 0x01, - sun4m = 0x02, - sun4d = 0x03, - sun4e = 0x04, - sun4u = 0x05, /* V8 ploos ploos */ - sun_unknown = 0x06, - ap1000 = 0x07, /* almost a sun4m */ -}; - -#define sparc_cpu_model sun4u - -/* This cannot ever be a sun4c :) That's just history. */ -#define ARCH_SUN4C 0 - -extern char reboot_command[]; - -/* These are here in an effort to more fully work around Spitfire Errata - * #51. Essentially, if a memory barrier occurs soon after a mispredicted - * branch, the chip can stop executing instructions until a trap occurs. - * Therefore, if interrupts are disabled, the chip can hang forever. - * - * It used to be believed that the memory barrier had to be right in the - * delay slot, but a case has been traced recently wherein the memory barrier - * was one instruction after the branch delay slot and the chip still hung. - * The offending sequence was the following in sym_wakeup_done() of the - * sym53c8xx_2 driver: - * - * call sym_ccb_from_dsa, 0 - * movge %icc, 0, %l0 - * brz,pn %o0, .LL1303 - * mov %o0, %l2 - * membar #LoadLoad - * - * The branch has to be mispredicted for the bug to occur. Therefore, we put - * the memory barrier explicitly into a "branch always, predicted taken" - * delay slot to avoid the problem case. - */ -#define membar_safe(type) \ -do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ - " membar " type "\n" \ - "1:\n" \ - : : : "memory"); \ -} while (0) - -/* The kernel always executes in TSO memory model these days, - * and furthermore most sparc64 chips implement more stringent - * memory ordering than required by the specifications. - */ -#define mb() membar_safe("#StoreLoad") -#define rmb() __asm__ __volatile__("":::"memory") -#define wmb() __asm__ __volatile__("":::"memory") - -#endif - -#define nop() __asm__ __volatile__ ("nop") - -#define read_barrier_depends() do { } while(0) -#define set_mb(__var, __value) \ - do { __var = __value; membar_safe("#StoreLoad"); } while(0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#define smp_rmb() rmb() -#define smp_wmb() wmb() -#else -#define smp_mb() __asm__ __volatile__("":::"memory") -#define smp_rmb() __asm__ __volatile__("":::"memory") -#define smp_wmb() __asm__ __volatile__("":::"memory") -#endif - -#define smp_read_barrier_depends() do { } while(0) - -#define flushi(addr) __asm__ __volatile__ ("flush %0" : : "r" (addr) : "memory") - -#define flushw_all() __asm__ __volatile__("flushw") - -/* Performance counter register access. */ -#define read_pcr(__p) __asm__ __volatile__("rd %%pcr, %0" : "=r" (__p)) -#define write_pcr(__p) __asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (__p)) -#define read_pic(__p) __asm__ __volatile__("rd %%pic, %0" : "=r" (__p)) - -/* Blackbird errata workaround. See commentary in - * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt() - * for more information. - */ -#define write_pic(__p) \ - __asm__ __volatile__("ba,pt %%xcc, 99f\n\t" \ - " nop\n\t" \ - ".align 64\n" \ - "99:wr %0, 0x0, %%pic\n\t" \ - "rd %%pic, %%g0" : : "r" (__p)) -#define reset_pic() write_pic(0) - -#ifndef __ASSEMBLY__ - -extern void sun_do_break(void); -extern int stop_a_enabled; -extern int scons_pwroff; - -extern void fault_in_user_windows(void); -extern void synchronize_user_stack(void); - -extern void __flushw_user(void); -#define flushw_user() __flushw_user() - -#define flush_user_windows flushw_user -#define flush_register_windows flushw_all - -/* Don't hold the runqueue lock over context switch */ -#define __ARCH_WANT_UNLOCKED_CTXSW -#define prepare_arch_switch(next) \ -do { \ - flushw_all(); \ -} while (0) - - /* See what happens when you design the chip correctly? - * - * We tell gcc we clobber all non-fixed-usage registers except - * for l0/l1. It will use one for 'next' and the other to hold - * the output value of 'last'. 'next' is not referenced again - * past the invocation of switch_to in the scheduler, so we need - * not preserve it's value. Hairy, but it lets us remove 2 loads - * and 2 stores in this critical code path. -DaveM - */ -#define switch_to(prev, next, last) \ -do { flush_tlb_pending(); \ - save_and_clear_fpu(); \ - /* If you are tempted to conditionalize the following */ \ - /* so that ASI is only written if it changes, think again. */ \ - __asm__ __volatile__("wr %%g0, %0, %%asi" \ - : : "r" (__thread_flag_byte_ptr(task_thread_info(next))[TI_FLAG_BYTE_CURRENT_DS]));\ - trap_block[current_thread_info()->cpu].thread = \ - task_thread_info(next); \ - __asm__ __volatile__( \ - "mov %%g4, %%g7\n\t" \ - "stx %%i6, [%%sp + 2047 + 0x70]\n\t" \ - "stx %%i7, [%%sp + 2047 + 0x78]\n\t" \ - "rdpr %%wstate, %%o5\n\t" \ - "stx %%o6, [%%g6 + %6]\n\t" \ - "stb %%o5, [%%g6 + %5]\n\t" \ - "rdpr %%cwp, %%o5\n\t" \ - "stb %%o5, [%%g6 + %8]\n\t" \ - "wrpr %%g0, 15, %%pil\n\t" \ - "mov %4, %%g6\n\t" \ - "ldub [%4 + %8], %%g1\n\t" \ - "wrpr %%g1, %%cwp\n\t" \ - "ldx [%%g6 + %6], %%o6\n\t" \ - "ldub [%%g6 + %5], %%o5\n\t" \ - "ldub [%%g6 + %7], %%o7\n\t" \ - "wrpr %%o5, 0x0, %%wstate\n\t" \ - "ldx [%%sp + 2047 + 0x70], %%i6\n\t" \ - "ldx [%%sp + 2047 + 0x78], %%i7\n\t" \ - "ldx [%%g6 + %9], %%g4\n\t" \ - "wrpr %%g0, 14, %%pil\n\t" \ - "brz,pt %%o7, switch_to_pc\n\t" \ - " mov %%g7, %0\n\t" \ - "sethi %%hi(ret_from_syscall), %%g1\n\t" \ - "jmpl %%g1 + %%lo(ret_from_syscall), %%g0\n\t" \ - " nop\n\t" \ - ".globl switch_to_pc\n\t" \ - "switch_to_pc:\n\t" \ - : "=&r" (last), "=r" (current), "=r" (current_thread_info_reg), \ - "=r" (__local_per_cpu_offset) \ - : "0" (task_thread_info(next)), \ - "i" (TI_WSTATE), "i" (TI_KSP), "i" (TI_NEW_CHILD), \ - "i" (TI_CWP), "i" (TI_TASK) \ - : "cc", \ - "g1", "g2", "g3", "g7", \ - "l1", "l2", "l3", "l4", "l5", "l6", "l7", \ - "i0", "i1", "i2", "i3", "i4", "i5", \ - "o0", "o1", "o2", "o3", "o4", "o5", "o7"); \ -} while(0) - -static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) -{ - unsigned long tmp1, tmp2; - - __asm__ __volatile__( -" mov %0, %1\n" -"1: lduw [%4], %2\n" -" cas [%4], %2, %0\n" -" cmp %2, %0\n" -" bne,a,pn %%icc, 1b\n" -" mov %1, %0\n" - : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2) - : "0" (val), "r" (m) - : "cc", "memory"); - return val; -} - -static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long val) -{ - unsigned long tmp1, tmp2; - - __asm__ __volatile__( -" mov %0, %1\n" -"1: ldx [%4], %2\n" -" casx [%4], %2, %0\n" -" cmp %2, %0\n" -" bne,a,pn %%xcc, 1b\n" -" mov %1, %0\n" - : "=&r" (val), "=&r" (tmp1), "=&r" (tmp2) - : "0" (val), "r" (m) - : "cc", "memory"); - return val; -} - -#define xchg(ptr,x) ((__typeof__(*(ptr)))__xchg((unsigned long)(x),(ptr),sizeof(*(ptr)))) - -extern void __xchg_called_with_bad_pointer(void); - -static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, - int size) -{ - switch (size) { - case 4: - return xchg32(ptr, x); - case 8: - return xchg64(ptr, x); - } - __xchg_called_with_bad_pointer(); - return x; -} - -extern void die_if_kernel(char *str, struct pt_regs *regs) __attribute__ ((noreturn)); - -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - */ - -#define __HAVE_ARCH_CMPXCHG 1 - -static inline unsigned long -__cmpxchg_u32(volatile int *m, int old, int new) -{ - __asm__ __volatile__("cas [%2], %3, %0" - : "=&r" (new) - : "0" (new), "r" (m), "r" (old) - : "memory"); - - return new; -} - -static inline unsigned long -__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new) -{ - __asm__ __volatile__("casx [%2], %3, %0" - : "=&r" (new) - : "0" (new), "r" (m), "r" (old) - : "memory"); - - return new; -} - -/* This function doesn't exist, so you'll get a linker error - if something tries to do an invalid cmpxchg(). */ -extern void __cmpxchg_called_with_bad_pointer(void); - -static inline unsigned long -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) -{ - switch (size) { - case 4: - return __cmpxchg_u32(ptr, old, new); - case 8: - return __cmpxchg_u64(ptr, old, new); - } - __cmpxchg_called_with_bad_pointer(); - return old; -} - -#define cmpxchg(ptr,o,n) \ - ({ \ - __typeof__(*(ptr)) _o_ = (o); \ - __typeof__(*(ptr)) _n_ = (n); \ - (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ - (unsigned long)_n_, sizeof(*(ptr))); \ - }) - -/* - * cmpxchg_local and cmpxchg64_local are atomic wrt current CPU. Always make - * them available. - */ - -static inline unsigned long __cmpxchg_local(volatile void *ptr, - unsigned long old, - unsigned long new, int size) -{ - switch (size) { - case 4: - case 8: return __cmpxchg(ptr, old, new, size); - default: - return __cmpxchg_local_generic(ptr, old, new, size); - } - - return old; -} - -#define cmpxchg_local(ptr, o, n) \ - ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \ - (unsigned long)(n), sizeof(*(ptr)))) -#define cmpxchg64_local(ptr, o, n) \ - ({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - cmpxchg_local((ptr), (o), (n)); \ - }) - -#endif /* !(__ASSEMBLY__) */ - -#define arch_align_stack(x) (x) - -#endif /* !(__SPARC64_SYSTEM_H) */ diff --git a/arch/sparc/include/asm/timer_32.h b/arch/sparc/include/asm/timer_32.h index 2ec030ef3810..1a91e11dd104 100644 --- a/arch/sparc/include/asm/timer_32.h +++ b/arch/sparc/include/asm/timer_32.h @@ -8,12 +8,13 @@ #ifndef _SPARC_TIMER_H #define _SPARC_TIMER_H -#include /* For SUN4M_NCPUS */ +#include /* For SUN4M_NCPUS */ #include extern __volatile__ unsigned int *master_l10_counter; /* FIXME: Make do_[gs]ettimeofday btfixup calls */ +struct timespec; BTFIXUPDEF_CALL(int, bus_do_settimeofday, struct timespec *tv) #define bus_do_settimeofday(tv) BTFIXUP_CALL(bus_do_settimeofday)(tv) diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h index 3e1449f07798..a1091afb8831 100644 --- a/arch/sparc/include/asm/uaccess_64.h +++ b/arch/sparc/include/asm/uaccess_64.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #endif diff --git a/arch/sparc/kernel/auxio_32.c b/arch/sparc/kernel/auxio_32.c index f7ea8f032719..56d0f52c3e62 100644 --- a/arch/sparc/kernel/auxio_32.c +++ b/arch/sparc/kernel/auxio_32.c @@ -13,6 +13,7 @@ #include #include #include /* memset(), Linux has no bzero() */ +#include /* Probe and map in the Auxiliary I/O register */ diff --git a/arch/sparc/kernel/devices.c b/arch/sparc/kernel/devices.c index 113c052c3043..6b2f56a6f8af 100644 --- a/arch/sparc/kernel/devices.c +++ b/arch/sparc/kernel/devices.c @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include extern void clock_stop_probe(void); /* tadpole.c */ extern void sun4c_probe_memerr_reg(void); diff --git a/arch/sparc/kernel/irq.h b/arch/sparc/kernel/irq.h index 42851122bbd9..5a021dd2f854 100644 --- a/arch/sparc/kernel/irq.h +++ b/arch/sparc/kernel/irq.h @@ -1,6 +1,7 @@ #include #include +#include struct irq_bucket { struct irq_bucket *next; diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index d45b710ea7e4..dff2c3d7d370 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/kgdb_32.c b/arch/sparc/kernel/kgdb_32.c index 539243b236fa..2e424a576a36 100644 --- a/arch/sparc/kernel/kgdb_32.c +++ b/arch/sparc/kernel/kgdb_32.c @@ -9,6 +9,7 @@ #include #include #include +#include extern unsigned long trapbase; diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index e5519870c3d9..276359e1ff56 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -16,6 +16,7 @@ #include #include +#include #include "entry.h" diff --git a/arch/sparc/kernel/muldiv.c b/arch/sparc/kernel/muldiv.c index 6ce1021d487c..f7db516b07d8 100644 --- a/arch/sparc/kernel/muldiv.c +++ b/arch/sparc/kernel/muldiv.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include "kernel.h" diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index c76fe0b5bd94..eb1c1f010a47 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "kstack.h" diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index a24072a49270..0ce0dd2332aa 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -14,6 +14,7 @@ #include #include #include +#include /* This code is shared between various users of the performance * counters. Users will be oprofile, pseudo-NMI watchdog, and the diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 8e16a4a21582..28559ce5eeb5 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -25,6 +25,8 @@ #include #include #include +#include +#include #include "kernel.h" #include "kstack.h" diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 935fdbcd88c2..efa07542e85f 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -38,6 +37,7 @@ #include #include #include +#include /* * Power management idle function diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 06b5b5fc20c7..aff0c72fac09 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -32,7 +32,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index 27b9e93d0121..896ba7c5cd8e 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -23,8 +23,8 @@ #include #include -#include #include +#include /* #define ALLOW_INIT_TRACING */ diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index 9388844cd88c..6f97c0767995 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -29,7 +29,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/reboot.c b/arch/sparc/kernel/reboot.c index 006a42dd2007..eba7d918162a 100644 --- a/arch/sparc/kernel/reboot.c +++ b/arch/sparc/kernel/reboot.c @@ -7,9 +7,9 @@ #include #include -#include #include #include +#include /* sysctl - toggle power-off restriction for serial console * systems in machine_power_off() diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index ffb883ddd0f0..d444468b27f6 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -33,7 +33,6 @@ #include #include -#include #include #include #include @@ -46,6 +45,7 @@ #include #include #include +#include #include "kernel.h" diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index a854a1c240ff..1414d16712b2 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -31,7 +31,6 @@ #include #include -#include #include #include #include @@ -49,6 +48,7 @@ #include #include #include +#include #ifdef CONFIG_IP_PNP #include diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index c8f5b50db89c..948700fb9036 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "sigutil.h" diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 7bb71b6fbd20..1e750e415d7a 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -25,6 +25,7 @@ #include #include #include /* flush_sig_insns */ +#include #include "sigutil.h" diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index d8a67e60be80..48b0f57b65f7 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include "entry.h" #include "systbls.h" diff --git a/arch/sparc/kernel/sigutil_32.c b/arch/sparc/kernel/sigutil_32.c index 35c7897b009a..0f6eebe71e6c 100644 --- a/arch/sparc/kernel/sigutil_32.c +++ b/arch/sparc/kernel/sigutil_32.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "sigutil.h" diff --git a/arch/sparc/kernel/sigutil_64.c b/arch/sparc/kernel/sigutil_64.c index b19570d41a39..387834a9c56a 100644 --- a/arch/sparc/kernel/sigutil_64.c +++ b/arch/sparc/kernel/sigutil_64.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "sigutil.h" diff --git a/arch/sparc/kernel/sparc_ksyms_64.c b/arch/sparc/kernel/sparc_ksyms_64.c index 12ff09824cd9..9f5e24ddcc70 100644 --- a/arch/sparc/kernel/sparc_ksyms_64.c +++ b/arch/sparc/kernel/sparc_ksyms_64.c @@ -10,12 +10,12 @@ #include #include -#include #include #include #include #include #include +#include struct poll { int fd; diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 1060e0672a4b..7d0c088e8aba 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index 591f20ca9e48..d2de21333146 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 0cbdaa41cd1e..c72fdf55e1c1 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -41,6 +40,7 @@ #include #include #include +#include #include "entry.h" #include "kstack.h" diff --git a/arch/sparc/kernel/unaligned_32.c b/arch/sparc/kernel/unaligned_32.c index 4d043a1b2492..c0ec89786193 100644 --- a/arch/sparc/kernel/unaligned_32.c +++ b/arch/sparc/kernel/unaligned_32.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/unaligned_64.c b/arch/sparc/kernel/unaligned_64.c index 76e4ac1a13e1..dae85bc2eda5 100644 --- a/arch/sparc/kernel/unaligned_64.c +++ b/arch/sparc/kernel/unaligned_64.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include @@ -24,6 +23,7 @@ #include #include #include +#include enum direction { load, /* ld, ldd, ldh, ldsh */ diff --git a/arch/sparc/kernel/visemul.c b/arch/sparc/kernel/visemul.c index 73370674ccff..08e074b7eb6a 100644 --- a/arch/sparc/kernel/visemul.c +++ b/arch/sparc/kernel/visemul.c @@ -9,9 +9,9 @@ #include #include -#include #include #include +#include /* OPF field of various VIS instructions. */ diff --git a/arch/sparc/math-emu/math_64.c b/arch/sparc/math-emu/math_64.c index e575bd2fe381..2bbe2f28ad23 100644 --- a/arch/sparc/math-emu/math_64.c +++ b/arch/sparc/math-emu/math_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "sfp-util_64.h" #include diff --git a/arch/sparc/mm/btfixup.c b/arch/sparc/mm/btfixup.c index 8a7f81743c12..09d6af22db2d 100644 --- a/arch/sparc/mm/btfixup.c +++ b/arch/sparc/mm/btfixup.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #define BTFIXUP_OPTIMIZE_NOP diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c index 8023fd7e77b5..7705c6731e28 100644 --- a/arch/sparc/mm/fault_32.c +++ b/arch/sparc/mm/fault_32.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 7b00de61c5f1..c5f9021b1a01 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -27,7 +27,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index b3f5e7dfea51..21faaeea85de 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -28,7 +28,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/mm/loadmmu.c b/arch/sparc/mm/loadmmu.c index 82ec8f666036..c5bf2a6c3858 100644 --- a/arch/sparc/mm/loadmmu.c +++ b/arch/sparc/mm/loadmmu.c @@ -11,7 +11,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/mm/tsb.c b/arch/sparc/mm/tsb.c index 536412d8f416..c52add79b83d 100644 --- a/arch/sparc/mm/tsb.c +++ b/arch/sparc/mm/tsb.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/prom/console_32.c b/arch/sparc/prom/console_32.c index a00f47b16c10..1cfb50f4cb9c 100644 --- a/arch/sparc/prom/console_32.c +++ b/arch/sparc/prom/console_32.c @@ -11,7 +11,6 @@ #include #include #include -#include #include extern void restore_current(void); diff --git a/arch/sparc/prom/console_64.c b/arch/sparc/prom/console_64.c index 9de6c8cfe04a..f95edcc54fd5 100644 --- a/arch/sparc/prom/console_64.c +++ b/arch/sparc/prom/console_64.c @@ -10,7 +10,6 @@ #include #include #include -#include #include static int __prom_console_write_buf(const char *buf, int len) diff --git a/arch/sparc/prom/misc_32.c b/arch/sparc/prom/misc_32.c index 677b6a10fbde..8dc0b6b271e8 100644 --- a/arch/sparc/prom/misc_32.c +++ b/arch/sparc/prom/misc_32.c @@ -13,7 +13,6 @@ #include #include #include -#include extern void restore_current(void); diff --git a/arch/sparc/prom/misc_64.c b/arch/sparc/prom/misc_64.c index e4f31d4d3715..f178b9dcc7b7 100644 --- a/arch/sparc/prom/misc_64.c +++ b/arch/sparc/prom/misc_64.c @@ -15,7 +15,6 @@ #include #include -#include #include static int prom_service_exists(const char *service_name) diff --git a/arch/sparc/prom/p1275.c b/arch/sparc/prom/p1275.c index d9850c2b9bf2..04a4540509dd 100644 --- a/arch/sparc/prom/p1275.c +++ b/arch/sparc/prom/p1275.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/arch/sparc/prom/ranges.c b/arch/sparc/prom/ranges.c index 0857aa9e839d..ad143c13bdc0 100644 --- a/arch/sparc/prom/ranges.c +++ b/arch/sparc/prom/ranges.c @@ -11,7 +11,6 @@ #include #include #include -#include static struct linux_prom_ranges promlib_obio_ranges[PROMREG_MAX]; static int num_obio_ranges; diff --git a/drivers/tty/serial/sunhv.c b/drivers/tty/serial/sunhv.c index 3ba5d285c2d0..505961cfd934 100644 --- a/drivers/tty/serial/sunhv.c +++ b/drivers/tty/serial/sunhv.c @@ -23,6 +23,7 @@ #include #include #include +#include #if defined(CONFIG_MAGIC_SYSRQ) #define SUPPORT_SYSRQ diff --git a/drivers/tty/serial/sunsab.c b/drivers/tty/serial/sunsab.c index 62dacd0ba526..f0d93eb7e6ec 100644 --- a/drivers/tty/serial/sunsab.c +++ b/drivers/tty/serial/sunsab.c @@ -37,6 +37,7 @@ #include #include #include +#include #if defined(CONFIG_SERIAL_SUNSAB_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) #define SUPPORT_SYSRQ diff --git a/drivers/tty/serial/sunsu.c b/drivers/tty/serial/sunsu.c index d3ca6da129fe..675303b8ed84 100644 --- a/drivers/tty/serial/sunsu.c +++ b/drivers/tty/serial/sunsu.c @@ -41,6 +41,7 @@ #include #include #include +#include #if defined(CONFIG_SERIAL_SUNSU_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) #define SUPPORT_SYSRQ diff --git a/drivers/tty/serial/sunzilog.c b/drivers/tty/serial/sunzilog.c index da4415842a43..b3b70b0bf85b 100644 --- a/drivers/tty/serial/sunzilog.c +++ b/drivers/tty/serial/sunzilog.c @@ -37,6 +37,7 @@ #include #include #include +#include #if defined(CONFIG_SERIAL_SUNZILOG_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) #define SUPPORT_SYSRQ diff --git a/kernel/signal.c b/kernel/signal.c index e76001ccf5cd..5120f1901f36 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "audit.h" /* audit_signal_info() */ /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 11d53046b905..04402ab7a046 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,6 +68,9 @@ #include #include #endif +#ifdef CONFIG_SPARC +#include +#endif #ifdef CONFIG_BSD_PROCESS_ACCT #include #endif -- cgit v1.2.3 From 96f951edb1f1bdbbc99b0cd458f9808bb83d58ae Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Add #includes needed to permit the removal of asm/system.h asm/system.h is a cause of circular dependency problems because it contains commonly used primitive stuff like barrier definitions and uncommonly used stuff like switch_to() that might require MMU definitions. asm/system.h has been disintegrated by this point on all arches into the following common segments: (1) asm/barrier.h Moved memory barrier definitions here. (2) asm/cmpxchg.h Moved xchg() and cmpxchg() here. #included in asm/atomic.h. (3) asm/bug.h Moved die() and similar here. (4) asm/exec.h Moved arch_align_stack() here. (5) asm/elf.h Moved AT_VECTOR_SIZE_ARCH here. (6) asm/switch_to.h Moved switch_to() here. Signed-off-by: David Howells --- drivers/misc/sgi-gru/gru_instructions.h | 1 + drivers/staging/crystalhd/bc_dts_defs.h | 2 ++ fs/binfmt_elf.c | 1 + fs/binfmt_elf_fdpic.c | 1 + fs/exec.c | 1 + include/asm-generic/bitops/atomic.h | 2 +- include/linux/llist.h | 3 +-- include/linux/mtd/map.h | 1 + include/linux/spinlock.h | 1 + kernel/sched/core.c | 1 + 10 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/drivers/misc/sgi-gru/gru_instructions.h b/drivers/misc/sgi-gru/gru_instructions.h index d95587cc794c..04d5170ac149 100644 --- a/drivers/misc/sgi-gru/gru_instructions.h +++ b/drivers/misc/sgi-gru/gru_instructions.h @@ -40,6 +40,7 @@ extern void gru_wait_abort_proc(void *cb); *((volatile unsigned long *)(p)) = v; /* force st.rel */ \ } while (0) #elif defined(CONFIG_X86_64) +#include #define __flush_cache(p) clflush(p) #define gru_ordered_store_ulong(p, v) \ do { \ diff --git a/drivers/staging/crystalhd/bc_dts_defs.h b/drivers/staging/crystalhd/bc_dts_defs.h index 8cd51a7aad8e..647e116e10de 100644 --- a/drivers/staging/crystalhd/bc_dts_defs.h +++ b/drivers/staging/crystalhd/bc_dts_defs.h @@ -26,6 +26,8 @@ #ifndef _BC_DTS_DEFS_H_ #define _BC_DTS_DEFS_H_ +#include + /* BIT Mask */ #define BC_BIT(_x) (1 << (_x)) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 81878b78c9d4..18276531f7c6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -35,6 +35,7 @@ #include #include #include +#include static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs); static int load_elf_library(struct file *); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c64bf5ee2df4..9bd5612a8224 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -39,6 +39,7 @@ #include #include #include +#include typedef char *elf_caddr_t; diff --git a/fs/exec.c b/fs/exec.c index 23559c227d9c..c8b63d14da85 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index ecc44a8e2b44..9ae6c34dc191 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -2,7 +2,7 @@ #define _ASM_GENERIC_BITOPS_ATOMIC_H_ #include -#include +#include #ifdef CONFIG_SMP #include diff --git a/include/linux/llist.h b/include/linux/llist.h index 801b44b07aac..a5199f6d0e82 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -56,8 +56,7 @@ */ #include -#include -#include +#include struct llist_head { struct llist_node *first; diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index 94e924e2ecd5..ade5c990f1f0 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_MTD_MAP_BANK_WIDTH_1 #define map_bankwidth(map) 1 diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 7df6c17b0281..fa0f93e4d86d 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -55,6 +55,7 @@ #include #include #include +#include #include diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 503d6426126d..157fb9b2b186 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -73,6 +73,7 @@ #include #include +#include #include #include #include -- cgit v1.2.3 From 9ffc93f203c18a70623f21950f1dd473c9ec48cd Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Mar 2012 18:30:03 +0100 Subject: Remove all #inclusions of asm/system.h Remove all #inclusions of asm/system.h preparatory to splitting and killing it. Performed with the following command: perl -p -i -e 's!^#\s*include\s*.*\n!!' `grep -Irl '^#\s*include\s*' *` Signed-off-by: David Howells --- arch/um/include/asm/fixmap.h | 1 - drivers/acpi/processor_driver.c | 1 - drivers/atm/eni.c | 1 - drivers/atm/firestream.c | 1 - drivers/atm/horizon.c | 1 - drivers/atm/idt77105.c | 1 - drivers/atm/iphase.c | 1 - drivers/atm/suni.c | 1 - drivers/atm/zatm.c | 1 - drivers/block/floppy.c | 1 - drivers/block/hd.c | 1 - drivers/block/nbd.c | 1 - drivers/block/xd.c | 1 - drivers/bluetooth/bt3c_cs.c | 1 - drivers/bluetooth/btuart_cs.c | 1 - drivers/bluetooth/dtl1_cs.c | 1 - drivers/char/apm-emulation.c | 1 - drivers/char/ds1302.c | 1 - drivers/char/efirtc.c | 1 - drivers/char/genrtc.c | 1 - drivers/char/hpet.c | 1 - drivers/char/ipmi/ipmi_devintf.c | 1 - drivers/char/ipmi/ipmi_msghandler.c | 1 - drivers/char/ipmi/ipmi_si_intf.c | 1 - drivers/char/lp.c | 1 - drivers/char/mbcs.c | 1 - drivers/char/mspec.c | 1 - drivers/char/mwave/3780i.c | 1 - drivers/char/nvram.c | 1 - drivers/char/nwflash.c | 1 - drivers/char/pcmcia/synclink_cs.c | 1 - drivers/char/rtc.c | 1 - drivers/char/sonypi.c | 1 - drivers/char/xilinx_hwicap/xilinx_hwicap.c | 1 - drivers/cpufreq/omap-cpufreq.c | 1 - drivers/cpufreq/powernow-k7.c | 1 - drivers/firewire/core-cdev.c | 1 - drivers/firewire/core-device.c | 1 - drivers/firewire/core-topology.c | 1 - drivers/firewire/ohci.c | 1 - drivers/firewire/sbp2.c | 1 - drivers/i2c/busses/i2c-acorn.c | 1 - drivers/ide/ide-cs.c | 1 - drivers/ide/qd65xx.c | 1 - drivers/infiniband/hw/ehca/ehca_reqs.c | 1 - drivers/input/joydev.c | 1 - drivers/input/joystick/amijoy.c | 1 - drivers/input/mouse/amimouse.c | 1 - drivers/input/mouse/atarimouse.c | 1 - drivers/input/serio/hp_sdc.c | 1 - drivers/input/serio/maceps2.c | 1 - drivers/input/serio/rpckbd.c | 1 - drivers/input/serio/sa1111ps2.c | 1 - drivers/isdn/hardware/avm/avm_cs.c | 1 - drivers/isdn/hisax/avma1_cs.c | 1 - drivers/isdn/hisax/elsa_cs.c | 1 - drivers/isdn/hisax/sedlbauer_cs.c | 1 - drivers/isdn/hisax/teles_cs.c | 1 - drivers/isdn/i4l/isdn_bsdcomp.c | 1 - drivers/isdn/pcbit/layer2.c | 1 - drivers/macintosh/macio-adb.c | 1 - drivers/macintosh/therm_adt746x.c | 1 - drivers/macintosh/therm_pm72.c | 1 - drivers/macintosh/therm_windtunnel.c | 1 - drivers/macintosh/via-cuda.c | 1 - drivers/macintosh/via-macii.c | 1 - drivers/macintosh/via-pmu.c | 1 - drivers/macintosh/via-pmu68k.c | 1 - drivers/macintosh/windfarm_lm75_sensor.c | 1 - drivers/macintosh/windfarm_pm121.c | 1 - drivers/macintosh/windfarm_pm81.c | 1 - drivers/macintosh/windfarm_pm91.c | 1 - drivers/macintosh/windfarm_smu_controls.c | 1 - drivers/macintosh/windfarm_smu_sensors.c | 1 - drivers/media/dvb/dvb-core/dmxdev.c | 1 - drivers/media/dvb/firewire/firedtv-fw.c | 1 - drivers/media/dvb/ttpci/av7110.c | 1 - drivers/media/media-devnode.c | 1 - drivers/media/video/ivtv/ivtv-driver.h | 1 - drivers/media/video/v4l2-common.c | 1 - drivers/media/video/v4l2-dev.c | 1 - drivers/message/i2o/i2o_scsi.c | 1 - drivers/mfd/mcp-core.c | 1 - drivers/mfd/mcp-sa11x0.c | 1 - drivers/misc/sgi-xp/xp.h | 1 - drivers/mmc/card/block.c | 1 - drivers/mtd/devices/pmc551.c | 1 - drivers/mtd/devices/slram.c | 1 - drivers/mtd/maps/pcmciamtd.c | 1 - drivers/mtd/nand/bcm_umi_nand.c | 1 - drivers/net/appletalk/cops.c | 1 - drivers/net/appletalk/ltpc.c | 1 - drivers/net/arcnet/com20020_cs.c | 1 - drivers/net/bonding/bond_main.c | 1 - drivers/net/can/slcan.c | 1 - drivers/net/cris/eth_v10.c | 1 - drivers/net/ethernet/3com/3c574_cs.c | 1 - drivers/net/ethernet/3com/3c589_cs.c | 1 - drivers/net/ethernet/8390/3c503.c | 1 - drivers/net/ethernet/8390/ac3200.c | 1 - drivers/net/ethernet/8390/apne.c | 1 - drivers/net/ethernet/8390/ax88796.c | 1 - drivers/net/ethernet/8390/axnet_cs.c | 1 - drivers/net/ethernet/8390/e2100.c | 1 - drivers/net/ethernet/8390/es3210.c | 1 - drivers/net/ethernet/8390/etherh.c | 1 - drivers/net/ethernet/8390/hp-plus.c | 1 - drivers/net/ethernet/8390/hp.c | 1 - drivers/net/ethernet/8390/lib8390.c | 1 - drivers/net/ethernet/8390/lne390.c | 1 - drivers/net/ethernet/8390/mac8390.c | 1 - drivers/net/ethernet/8390/ne-h8300.c | 1 - drivers/net/ethernet/8390/ne.c | 1 - drivers/net/ethernet/8390/ne2.c | 1 - drivers/net/ethernet/8390/ne2k-pci.c | 1 - drivers/net/ethernet/8390/ne3210.c | 1 - drivers/net/ethernet/8390/pcnet_cs.c | 1 - drivers/net/ethernet/8390/smc-mca.c | 1 - drivers/net/ethernet/8390/smc-ultra.c | 1 - drivers/net/ethernet/8390/smc-ultra32.c | 1 - drivers/net/ethernet/8390/stnic.c | 1 - drivers/net/ethernet/8390/wd.c | 1 - drivers/net/ethernet/8390/zorro8390.c | 1 - drivers/net/ethernet/alteon/acenic.c | 1 - drivers/net/ethernet/amd/7990.c | 1 - drivers/net/ethernet/amd/am79c961a.c | 1 - drivers/net/ethernet/amd/amd8111e.c | 1 - drivers/net/ethernet/amd/declance.c | 1 - drivers/net/ethernet/amd/hplance.c | 1 - drivers/net/ethernet/amd/mvme147.c | 1 - drivers/net/ethernet/amd/nmclan_cs.c | 1 - drivers/net/ethernet/amd/sunlance.c | 1 - drivers/net/ethernet/broadcom/tg3.c | 1 - drivers/net/ethernet/cirrus/cs89x0.c | 1 - drivers/net/ethernet/cirrus/mac89x0.c | 1 - drivers/net/ethernet/dlink/de600.c | 1 - drivers/net/ethernet/dlink/de620.c | 1 - drivers/net/ethernet/fujitsu/at1700.c | 1 - drivers/net/ethernet/fujitsu/eth16i.c | 1 - drivers/net/ethernet/fujitsu/fmvj18x_cs.c | 1 - drivers/net/ethernet/i825xx/3c507.c | 1 - drivers/net/ethernet/i825xx/3c527.c | 1 - drivers/net/ethernet/i825xx/eepro.c | 1 - drivers/net/ethernet/i825xx/eexpress.c | 1 - drivers/net/ethernet/i825xx/ether1.c | 1 - drivers/net/ethernet/i825xx/znet.c | 1 - drivers/net/ethernet/korina.c | 1 - drivers/net/ethernet/marvell/mv643xx_eth.c | 1 - drivers/net/ethernet/marvell/pxa168_eth.c | 1 - drivers/net/ethernet/natsemi/jazzsonic.c | 1 - drivers/net/ethernet/natsemi/macsonic.c | 1 - drivers/net/ethernet/natsemi/ns83820.c | 1 - drivers/net/ethernet/neterion/s2io.c | 1 - drivers/net/ethernet/nvidia/forcedeth.c | 1 - drivers/net/ethernet/realtek/atp.c | 1 - drivers/net/ethernet/realtek/r8169.c | 1 - drivers/net/ethernet/seeq/ether3.c | 1 - drivers/net/ethernet/seeq/seeq8005.c | 1 - drivers/net/ethernet/smsc/smc91c92_cs.c | 1 - drivers/net/ethernet/sun/cassini.c | 1 - drivers/net/ethernet/sun/sunbmac.c | 1 - drivers/net/ethernet/sun/sungem.c | 1 - drivers/net/ethernet/sun/sunhme.c | 1 - drivers/net/ethernet/sun/sunqe.c | 1 - drivers/net/ethernet/tundra/tsi108_eth.c | 1 - drivers/net/ethernet/xircom/xirc2ps_cs.c | 1 - drivers/net/hamradio/6pack.c | 1 - drivers/net/hamradio/baycom_par.c | 1 - drivers/net/hamradio/bpqether.c | 1 - drivers/net/hamradio/mkiss.c | 1 - drivers/net/hamradio/scc.c | 1 - drivers/net/hamradio/yam.c | 1 - drivers/net/hippi/rrunner.c | 1 - drivers/net/irda/donauboe.c | 1 - drivers/net/loopback.c | 1 - drivers/net/plip/plip.c | 1 - drivers/net/slip/slhc.c | 1 - drivers/net/slip/slip.c | 1 - drivers/net/tokenring/3c359.c | 1 - drivers/net/tokenring/abyss.c | 1 - drivers/net/tokenring/ibmtr_cs.c | 1 - drivers/net/tokenring/lanstreamer.c | 1 - drivers/net/tokenring/madgemc.c | 1 - drivers/net/tokenring/olympic.c | 1 - drivers/net/tokenring/proteon.c | 1 - drivers/net/tokenring/skisa.c | 1 - drivers/net/tokenring/smctr.c | 1 - drivers/net/tokenring/tms380tr.c | 1 - drivers/net/tokenring/tmspci.c | 1 - drivers/net/tun.c | 1 - drivers/net/wan/dlci.c | 1 - drivers/net/wan/dscc4.c | 1 - drivers/net/wan/hd64570.c | 1 - drivers/net/wan/hd64572.c | 1 - drivers/net/wan/lapbether.c | 1 - drivers/net/wan/sdla.c | 1 - drivers/net/wan/x25_asy.c | 1 - drivers/net/wireless/airo.c | 1 - drivers/net/wireless/airo_cs.c | 1 - drivers/net/wireless/atmel.c | 1 - drivers/net/wireless/atmel_cs.c | 1 - drivers/net/wireless/prism54/islpci_mgt.c | 1 - drivers/net/wireless/ray_cs.c | 1 - drivers/net/wireless/wl3501_cs.c | 1 - drivers/nubus/nubus.c | 1 - drivers/parisc/dino.c | 1 - drivers/parisc/iosapic.c | 1 - drivers/parisc/lba_pci.c | 1 - drivers/pcmcia/cs.c | 1 - drivers/pcmcia/i82092.c | 1 - drivers/pcmcia/i82365.c | 1 - drivers/pcmcia/m32r_cfc.c | 1 - drivers/pcmcia/m32r_pcc.c | 1 - drivers/pcmcia/m8xx_pcmcia.c | 1 - drivers/pcmcia/pd6729.c | 1 - drivers/pcmcia/pxa2xx_base.c | 1 - drivers/pcmcia/sa11xx_base.c | 1 - drivers/pcmcia/soc_common.c | 1 - drivers/pcmcia/socket_sysfs.c | 1 - drivers/pcmcia/tcic.c | 1 - drivers/pcmcia/xxs1500_ss.c | 1 - drivers/pnp/pnpbios/bioscalls.c | 1 - drivers/pnp/pnpbios/core.c | 1 - drivers/s390/crypto/ap_bus.c | 1 - drivers/sbus/char/flash.c | 1 - drivers/sbus/char/openprom.c | 1 - drivers/sbus/char/uctrl.c | 1 - drivers/scsi/53c700.c | 1 - drivers/scsi/BusLogic.c | 1 - drivers/scsi/advansys.c | 1 - drivers/scsi/aha152x.c | 1 - drivers/scsi/aha1542.c | 1 - drivers/scsi/aha1740.c | 1 - drivers/scsi/arcmsr/arcmsr_hba.c | 1 - drivers/scsi/arm/acornscsi.c | 1 - drivers/scsi/arm/cumana_1.c | 1 - drivers/scsi/arm/oak.c | 1 - drivers/scsi/atp870u.c | 1 - drivers/scsi/dtc.c | 1 - drivers/scsi/fd_mcs.c | 1 - drivers/scsi/fdomain.c | 1 - drivers/scsi/g_NCR5380.c | 1 - drivers/scsi/gdth.c | 1 - drivers/scsi/ibmmca.c | 1 - drivers/scsi/in2000.c | 1 - drivers/scsi/mac53c94.c | 1 - drivers/scsi/mac_scsi.c | 1 - drivers/scsi/mesh.c | 1 - drivers/scsi/ncr53c8xx.c | 1 - drivers/scsi/nsp32.c | 1 - drivers/scsi/osst.c | 1 - drivers/scsi/pas16.c | 1 - drivers/scsi/qla1280.c | 1 - drivers/scsi/qlogicpti.c | 1 - drivers/scsi/st.c | 1 - drivers/scsi/sun3_scsi.c | 1 - drivers/scsi/sun3_scsi_vme.c | 1 - drivers/scsi/sym53c416.c | 1 - drivers/scsi/t128.c | 1 - drivers/scsi/u14-34f.c | 1 - drivers/scsi/ultrastor.c | 1 - drivers/scsi/wd7000.c | 1 - drivers/spi/spi-omap-uwire.c | 1 - drivers/staging/comedi/drivers.c | 1 - drivers/staging/comedi/drivers/cb_pcidas64.c | 1 - drivers/staging/comedi/drivers/mite.c | 1 - drivers/staging/crystalhd/crystalhd.h | 1 - drivers/staging/crystalhd/crystalhd_lnx.h | 1 - drivers/staging/crystalhd/crystalhd_misc.h | 1 + drivers/staging/et131x/et131x.c | 1 - drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c | 1 - drivers/staging/media/go7007/go7007-driver.c | 1 - drivers/staging/media/go7007/go7007-i2c.c | 1 - drivers/staging/media/go7007/go7007-v4l2.c | 1 - drivers/staging/media/go7007/snd-go7007.c | 1 - drivers/staging/media/lirc/lirc_serial.c | 1 - drivers/staging/media/lirc/lirc_sir.c | 1 - drivers/staging/panel/panel.c | 1 - drivers/staging/sbe-2t3e3/io.c | 1 - drivers/staging/telephony/phonedev.c | 1 - drivers/staging/tidspbridge/include/dspbridge/host_os.h | 1 - drivers/staging/wlags49_h2/wl_cs.c | 1 - drivers/staging/wlags49_h2/wl_main.c | 3 +-- drivers/staging/wlags49_h2/wl_netdev.c | 3 +-- drivers/staging/wlags49_h2/wl_pci.c | 1 - drivers/staging/wlags49_h2/wl_util.c | 3 +-- drivers/tty/amiserial.c | 1 - drivers/tty/isicom.c | 1 - drivers/tty/moxa.c | 1 - drivers/tty/mxser.c | 1 - drivers/tty/n_hdlc.c | 1 - drivers/tty/n_tty.c | 1 - drivers/tty/pty.c | 1 - drivers/tty/serial/68328serial.c | 1 - drivers/tty/serial/8250/serial_cs.c | 1 - drivers/tty/serial/crisv10.c | 1 - drivers/tty/serial/dz.c | 1 - drivers/tty/serial/icom.c | 1 - drivers/tty/serial/msm_serial_hs.c | 1 - drivers/tty/serial/zs.c | 1 - drivers/tty/synclink.c | 1 - drivers/tty/synclink_gt.c | 1 - drivers/tty/synclinkmp.c | 1 - drivers/tty/tty_io.c | 1 - drivers/tty/tty_ioctl.c | 1 - drivers/tty/vt/vt.c | 1 - drivers/usb/gadget/amd5536udc.c | 1 - drivers/usb/gadget/at91_udc.c | 1 - drivers/usb/gadget/dummy_hcd.c | 1 - drivers/usb/gadget/fsl_udc_core.c | 1 - drivers/usb/gadget/goku_udc.c | 1 - drivers/usb/gadget/langwell_udc.c | 1 - drivers/usb/gadget/mv_udc_core.c | 1 - drivers/usb/gadget/net2272.c | 1 - drivers/usb/gadget/net2280.c | 1 - drivers/usb/gadget/omap_udc.c | 1 - drivers/usb/gadget/printer.c | 1 - drivers/usb/gadget/pxa25x_udc.c | 1 - drivers/usb/gadget/rndis.c | 1 - drivers/usb/gadget/s3c2410_udc.c | 1 - drivers/usb/host/ehci-hcd.c | 1 - drivers/usb/host/isp116x-hcd.c | 1 - drivers/usb/host/isp1362-hcd.c | 1 - drivers/usb/host/ohci-hcd.c | 1 - drivers/usb/host/oxu210hp-hcd.c | 1 - drivers/usb/host/sl811-hcd.c | 1 - drivers/usb/host/u132-hcd.c | 1 - drivers/usb/host/uhci-hcd.c | 1 - drivers/video/amifb.c | 1 - drivers/video/bt431.h | 1 - drivers/video/bt455.h | 1 - drivers/video/console/fbcon.c | 1 - drivers/video/console/newport_con.c | 1 - drivers/video/cyber2000fb.c | 1 - drivers/video/dnfb.c | 1 - drivers/video/neofb.c | 1 - drivers/video/pmag-ba-fb.c | 1 - drivers/video/pmagb-b-fb.c | 1 - drivers/video/q40fb.c | 1 - drivers/video/savage/savagefb_driver.c | 1 - drivers/virtio/config.c | 1 - drivers/watchdog/advantechwdt.c | 1 - drivers/watchdog/alim7101_wdt.c | 1 - drivers/watchdog/booke_wdt.c | 1 - drivers/watchdog/eurotechwdt.c | 1 - drivers/watchdog/ib700wdt.c | 1 - drivers/watchdog/it87_wdt.c | 1 - drivers/watchdog/machzwd.c | 1 - drivers/watchdog/pc87413_wdt.c | 1 - drivers/watchdog/sbc60xxwdt.c | 1 - drivers/watchdog/sbc7240_wdt.c | 1 - drivers/watchdog/sbc8360.c | 1 - drivers/watchdog/sbc_fitpc2_wdt.c | 1 - drivers/watchdog/sc520_wdt.c | 1 - drivers/watchdog/smsc37b787_wdt.c | 1 - drivers/watchdog/w83627hf_wdt.c | 1 - drivers/watchdog/w83697hf_wdt.c | 1 - drivers/watchdog/w83697ug_wdt.c | 1 - drivers/watchdog/w83877f_wdt.c | 1 - drivers/watchdog/w83977f_wdt.c | 1 - drivers/watchdog/wdt.c | 1 - drivers/watchdog/wdt977.c | 1 - drivers/watchdog/wdt_pci.c | 1 - fs/binfmt_aout.c | 1 - fs/binfmt_flat.c | 1 - fs/coda/inode.c | 1 - fs/coda/psdev.c | 1 - fs/coda/upcall.c | 1 - fs/eventpoll.c | 1 - fs/jbd2/commit.c | 1 - fs/jbd2/journal.c | 1 - fs/ncpfs/file.c | 1 - fs/ncpfs/inode.c | 1 - fs/ncpfs/mmap.c | 1 - fs/nfs/client.c | 1 - fs/nfs/direct.c | 1 - fs/nfs/file.c | 1 - fs/nfs/getroot.c | 1 - fs/nfs/inode.c | 1 - fs/nfs/read.c | 1 - fs/nfs/super.c | 1 - fs/proc/inode.c | 1 - fs/reiserfs/journal.c | 1 - fs/ufs/inode.c | 1 - fs/ufs/super.c | 1 - fs/xfs/xfs_buf.h | 1 - include/acpi/platform/aclinux.h | 1 - include/asm-generic/atomic.h | 1 - include/linux/cnt32_to_63.h | 1 - include/linux/debug_locks.h | 1 - include/linux/efi.h | 1 - include/linux/ide.h | 1 - include/linux/interrupt.h | 1 - include/linux/lsm_audit.h | 1 - include/linux/mtd/map.h | 1 - include/linux/parport.h | 1 - include/linux/rwsem.h | 1 - include/linux/sched.h | 1 - include/linux/skbuff.h | 1 - include/linux/spinlock.h | 1 - include/linux/stop_machine.h | 1 - include/linux/tty.h | 1 - include/linux/wait.h | 1 - kernel/debug/debug_core.c | 1 - kernel/debug/kdb/kdb_bt.c | 1 - kernel/dma.c | 1 - kernel/kexec.c | 1 - kernel/rwsem.c | 1 - kernel/sysctl.c | 1 - lib/llist.c | 1 - lib/raid6/altivec.uc | 1 - net/802/fc.c | 1 - net/802/fddi.c | 1 - net/802/hippi.c | 1 - net/802/tr.c | 1 - net/atm/clip.c | 1 - net/ax25/af_ax25.c | 1 - net/ax25/ax25_addr.c | 1 - net/ax25/ax25_dev.c | 1 - net/ax25/ax25_ds_in.c | 1 - net/ax25/ax25_ds_subr.c | 1 - net/ax25/ax25_ds_timer.c | 1 - net/ax25/ax25_iface.c | 1 - net/ax25/ax25_in.c | 1 - net/ax25/ax25_ip.c | 1 - net/ax25/ax25_out.c | 1 - net/ax25/ax25_route.c | 1 - net/ax25/ax25_std_in.c | 1 - net/ax25/ax25_std_subr.c | 1 - net/ax25/ax25_std_timer.c | 1 - net/ax25/ax25_subr.c | 1 - net/ax25/ax25_timer.c | 1 - net/ax25/ax25_uid.c | 1 - net/bluetooth/bnep/sock.c | 1 - net/bluetooth/cmtp/sock.c | 1 - net/bluetooth/hci_conn.c | 1 - net/bluetooth/hci_core.c | 1 - net/bluetooth/hci_event.c | 1 - net/bluetooth/hci_sock.c | 1 - net/bluetooth/l2cap_core.c | 1 - net/bluetooth/rfcomm/sock.c | 1 - net/bluetooth/sco.c | 1 - net/core/datagram.c | 1 - net/core/dev.c | 1 - net/core/filter.c | 1 - net/core/gen_estimator.c | 1 - net/core/rtnetlink.c | 1 - net/core/scm.c | 1 - net/core/skbuff.c | 1 - net/core/sock.c | 1 - net/core/utils.c | 1 - net/decnet/af_decnet.c | 1 - net/decnet/dn_dev.c | 1 - net/decnet/dn_nsp_in.c | 1 - net/decnet/dn_nsp_out.c | 1 - net/econet/af_econet.c | 1 - net/ethernet/eth.c | 1 - net/ipv4/af_inet.c | 1 - net/ipv4/arp.c | 1 - net/ipv4/devinet.c | 1 - net/ipv4/fib_frontend.c | 1 - net/ipv4/fib_semantics.c | 1 - net/ipv4/fib_trie.c | 1 - net/ipv4/icmp.c | 1 - net/ipv4/igmp.c | 1 - net/ipv4/ip_input.c | 1 - net/ipv4/ip_output.c | 1 - net/ipv4/ipmr.c | 1 - net/ipv4/ping.c | 1 - net/ipv4/route.c | 1 - net/ipv4/udp.c | 1 - net/ipv6/af_inet6.c | 1 - net/ipv6/icmp.c | 1 - net/ipv6/ip6mr.c | 1 - net/irda/irlan/irlan_client.c | 1 - net/irda/irlan/irlan_common.c | 1 - net/irda/irlan/irlan_provider.c | 1 - net/irda/timer.c | 1 - net/lapb/lapb_iface.c | 1 - net/lapb/lapb_in.c | 1 - net/lapb/lapb_out.c | 1 - net/lapb/lapb_subr.c | 1 - net/lapb/lapb_timer.c | 1 - net/netfilter/ipvs/ip_vs_app.c | 1 - net/netfilter/ipvs/ip_vs_proto.c | 1 - net/netfilter/nfnetlink.c | 1 - net/netrom/af_netrom.c | 1 - net/netrom/nr_dev.c | 1 - net/netrom/nr_in.c | 1 - net/netrom/nr_out.c | 1 - net/netrom/nr_route.c | 1 - net/netrom/nr_subr.c | 1 - net/netrom/nr_timer.c | 1 - net/openvswitch/datapath.c | 1 - net/packet/af_packet.c | 1 - net/rose/af_rose.c | 1 - net/rose/rose_dev.c | 1 - net/rose/rose_in.c | 1 - net/rose/rose_link.c | 1 - net/rose/rose_out.c | 1 - net/rose/rose_route.c | 1 - net/rose/rose_subr.c | 1 - net/rose/rose_timer.c | 1 - net/sunrpc/clnt.c | 1 - security/selinux/include/avc.h | 1 - sound/oss/os.h | 1 - sound/oss/vidc.c | 1 - sound/oss/waveartist.c | 1 - sound/pci/asihpi/hpios.h | 1 - sound/pci/aw2/aw2-saa7146.c | 1 - 510 files changed, 4 insertions(+), 512 deletions(-) (limited to 'kernel') diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h index 69c0252345f1..21a423bae5e8 100644 --- a/arch/um/include/asm/fixmap.h +++ b/arch/um/include/asm/fixmap.h @@ -2,7 +2,6 @@ #define __UM_FIXMAP_H #include -#include #include #include #include diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c index 2801b418d7bb..d4d9cb7e016a 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -46,7 +46,6 @@ #include #include -#include #include #include #include diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index 6ff612d099c3..2059ee460b0c 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c index 5072f8ac16fd..86fed1b91695 100644 --- a/drivers/atm/firestream.c +++ b/drivers/atm/firestream.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/atm/horizon.c b/drivers/atm/horizon.c index b81210330aca..75fd691cd43e 100644 --- a/drivers/atm/horizon.c +++ b/drivers/atm/horizon.c @@ -43,7 +43,6 @@ #include #include -#include #include #include #include diff --git a/drivers/atm/idt77105.c b/drivers/atm/idt77105.c index 487a54739854..45d506363aba 100644 --- a/drivers/atm/idt77105.c +++ b/drivers/atm/idt77105.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/drivers/atm/iphase.c b/drivers/atm/iphase.c index 9e373ba20308..d4386019af5d 100644 --- a/drivers/atm/iphase.c +++ b/drivers/atm/iphase.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/atm/suni.c b/drivers/atm/suni.c index 90f1ccca9e52..02159345566c 100644 --- a/drivers/atm/suni.c +++ b/drivers/atm/suni.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/atm/zatm.c b/drivers/atm/zatm.c index d889f56e8d8c..abe4e20b0766 100644 --- a/drivers/atm/zatm.c +++ b/drivers/atm/zatm.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 744f078f4dd8..76a08236430a 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -202,7 +202,6 @@ static int slow_floppy; #include #include -#include static int FLOPPY_IRQ = 6; static int FLOPPY_DMA = 2; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index b52c9ca146fc..bf397bf108b7 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -44,7 +44,6 @@ #define HD_IRQ 14 #define REALLY_SLOW_IO -#include #include #include diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c3f0ee16594d..c7ba11f9b203 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -34,7 +34,6 @@ #include #include -#include #include #include diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 51a972704db5..ff540520bada 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -52,7 +52,6 @@ #include #include -#include #include #include diff --git a/drivers/bluetooth/bt3c_cs.c b/drivers/bluetooth/bt3c_cs.c index 9c09d6f05dc9..308c8599ab55 100644 --- a/drivers/bluetooth/bt3c_cs.c +++ b/drivers/bluetooth/bt3c_cs.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include diff --git a/drivers/bluetooth/btuart_cs.c b/drivers/bluetooth/btuart_cs.c index 194224d07f7c..c4fc2f3fc32c 100644 --- a/drivers/bluetooth/btuart_cs.c +++ b/drivers/bluetooth/btuart_cs.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include diff --git a/drivers/bluetooth/dtl1_cs.c b/drivers/bluetooth/dtl1_cs.c index 049c0594a76b..6e8d96189684 100644 --- a/drivers/bluetooth/dtl1_cs.c +++ b/drivers/bluetooth/dtl1_cs.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c index f4837a893dfa..57501ca9204b 100644 --- a/drivers/char/apm-emulation.c +++ b/drivers/char/apm-emulation.c @@ -31,7 +31,6 @@ #include #include -#include /* * The apm_bios device is one of the misc char devices. diff --git a/drivers/char/ds1302.c b/drivers/char/ds1302.c index ed8303f9890c..7d34b203718a 100644 --- a/drivers/char/ds1302.c +++ b/drivers/char/ds1302.c @@ -24,7 +24,6 @@ #include #include -#include #include #if defined(CONFIG_M32R) #include diff --git a/drivers/char/efirtc.c b/drivers/char/efirtc.c index 53c524e7b829..a082d00b0f11 100644 --- a/drivers/char/efirtc.c +++ b/drivers/char/efirtc.c @@ -37,7 +37,6 @@ #include #include -#include #define EFI_RTC_VERSION "0.4" diff --git a/drivers/char/genrtc.c b/drivers/char/genrtc.c index f773a9dd14f3..21cb980f1157 100644 --- a/drivers/char/genrtc.c +++ b/drivers/char/genrtc.c @@ -56,7 +56,6 @@ #include #include -#include #include /* diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c index 0833896cf6f2..3845ab44c330 100644 --- a/drivers/char/hpet.c +++ b/drivers/char/hpet.c @@ -36,7 +36,6 @@ #include #include -#include #include #include diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c index 2aa3977aae5e..9eb360ff8cab 100644 --- a/drivers/char/ipmi/ipmi_devintf.c +++ b/drivers/char/ipmi/ipmi_devintf.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c index 58c0e6387cf7..c90e9390b78c 100644 --- a/drivers/char/ipmi/ipmi_msghandler.c +++ b/drivers/char/ipmi/ipmi_msghandler.c @@ -33,7 +33,6 @@ #include #include -#include #include #include #include diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c index 50fcf9c04569..f9fdc114b31d 100644 --- a/drivers/char/ipmi/ipmi_si_intf.c +++ b/drivers/char/ipmi/ipmi_si_intf.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include diff --git a/drivers/char/lp.c b/drivers/char/lp.c index f43485607063..0fbf1a776b52 100644 --- a/drivers/char/lp.c +++ b/drivers/char/lp.c @@ -135,7 +135,6 @@ #include #include -#include /* if you have more than 8 printers, remember to increase LP_NO */ #define LP_NO 8 diff --git a/drivers/char/mbcs.c b/drivers/char/mbcs.c index 1aeaaba680d2..47ff7e470d87 100644 --- a/drivers/char/mbcs.c +++ b/drivers/char/mbcs.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index 5c0d96a820fa..8b78750f1efe 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/mwave/3780i.c b/drivers/char/mwave/3780i.c index 492dbfb2efd6..881c9e595939 100644 --- a/drivers/char/mwave/3780i.c +++ b/drivers/char/mwave/3780i.c @@ -56,7 +56,6 @@ #include #include -#include #include #include "smapi.h" #include "mwavedd.h" diff --git a/drivers/char/nvram.c b/drivers/char/nvram.c index eaade8a1ecd7..9df78e2cc45d 100644 --- a/drivers/char/nvram.c +++ b/drivers/char/nvram.c @@ -111,7 +111,6 @@ #include #include -#include static DEFINE_MUTEX(nvram_mutex); static DEFINE_SPINLOCK(nvram_state_lock); diff --git a/drivers/char/nwflash.c b/drivers/char/nwflash.c index bf586ae1ee83..d45c3345b4af 100644 --- a/drivers/char/nwflash.c +++ b/drivers/char/nwflash.c @@ -32,7 +32,6 @@ #include #include #include -#include #include /*****************************************************************************/ diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index f6453df4921c..0a484b4a1b02 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -60,7 +60,6 @@ #include #include -#include #include #include #include diff --git a/drivers/char/rtc.c b/drivers/char/rtc.c index 872e09a02d23..af9437488b6c 100644 --- a/drivers/char/rtc.c +++ b/drivers/char/rtc.c @@ -83,7 +83,6 @@ #include #include -#include #ifdef CONFIG_X86 #include diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 1ee8ce7d2762..45713f0e7d61 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -54,7 +54,6 @@ #include #include -#include #include diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c index e90e1c74fd4c..31ba11ca75e1 100644 --- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c +++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c @@ -89,7 +89,6 @@ #include #include -#include #ifdef CONFIG_OF /* For open firmware. */ diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 5d04c57aae30..3093ca6152c9 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -26,7 +26,6 @@ #include #include -#include #include #include diff --git a/drivers/cpufreq/powernow-k7.c b/drivers/cpufreq/powernow-k7.c index cf7e1ee005a2..334cc2f1e9f1 100644 --- a/drivers/cpufreq/powernow-k7.c +++ b/drivers/cpufreq/powernow-k7.c @@ -27,7 +27,6 @@ #include /* Needed for recalibrate_cpu_khz() */ #include -#include #include #ifdef CONFIG_X86_POWERNOW_K7_ACPI diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index 22c6df5f136d..2e6b24547e2a 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -44,7 +44,6 @@ #include #include -#include #include "core.h" diff --git a/drivers/firewire/core-device.c b/drivers/firewire/core-device.c index afa7c83bd114..68109e9bb04e 100644 --- a/drivers/firewire/core-device.c +++ b/drivers/firewire/core-device.c @@ -40,7 +40,6 @@ #include #include -#include #include "core.h" diff --git a/drivers/firewire/core-topology.c b/drivers/firewire/core-topology.c index 255646ffc352..0de83508f321 100644 --- a/drivers/firewire/core-topology.c +++ b/drivers/firewire/core-topology.c @@ -31,7 +31,6 @@ #include #include -#include #include "core.h" diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 187b3f2e797e..2b5460075a9f 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -46,7 +46,6 @@ #include #include -#include #ifdef CONFIG_PPC_PMAC #include diff --git a/drivers/firewire/sbp2.c b/drivers/firewire/sbp2.c index 000a29ffedae..b7e65d7eab64 100644 --- a/drivers/firewire/sbp2.c +++ b/drivers/firewire/sbp2.c @@ -52,7 +52,6 @@ #include #include -#include #include #include diff --git a/drivers/i2c/busses/i2c-acorn.c b/drivers/i2c/busses/i2c-acorn.c index 86796488ef4f..ed9f48d566db 100644 --- a/drivers/i2c/busses/i2c-acorn.c +++ b/drivers/i2c/busses/i2c-acorn.c @@ -19,7 +19,6 @@ #include #include -#include #define FORCE_ONES 0xdc #define SCL 0x02 diff --git a/drivers/ide/ide-cs.c b/drivers/ide/ide-cs.c index d2f3db3cf3ed..28e344ea514c 100644 --- a/drivers/ide/ide-cs.c +++ b/drivers/ide/ide-cs.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include diff --git a/drivers/ide/qd65xx.c b/drivers/ide/qd65xx.c index 8bbfe5557c7b..e03f4f19c1d6 100644 --- a/drivers/ide/qd65xx.c +++ b/drivers/ide/qd65xx.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #define DRV_NAME "qd65xx" diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c index 9a3fbfca9b41..fd05f48f6b0b 100644 --- a/drivers/infiniband/hw/ehca/ehca_reqs.c +++ b/drivers/infiniband/hw/ehca/ehca_reqs.c @@ -42,7 +42,6 @@ */ -#include #include "ehca_classes.h" #include "ehca_tools.h" #include "ehca_qes.h" diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c index c24ec2d5f926..26043cc6a016 100644 --- a/drivers/input/joydev.c +++ b/drivers/input/joydev.c @@ -13,7 +13,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include -#include #include #include #include diff --git a/drivers/input/joystick/amijoy.c b/drivers/input/joystick/amijoy.c index 0bc86204213e..24044dacbf70 100644 --- a/drivers/input/joystick/amijoy.c +++ b/drivers/input/joystick/amijoy.c @@ -35,7 +35,6 @@ #include #include -#include #include #include diff --git a/drivers/input/mouse/amimouse.c b/drivers/input/mouse/amimouse.c index ff5f61a0fd3a..5fa99341a39d 100644 --- a/drivers/input/mouse/amimouse.c +++ b/drivers/input/mouse/amimouse.c @@ -25,7 +25,6 @@ #include #include -#include #include #include #include diff --git a/drivers/input/mouse/atarimouse.c b/drivers/input/mouse/atarimouse.c index 5c4a692bf73a..d1c43236b125 100644 --- a/drivers/input/mouse/atarimouse.c +++ b/drivers/input/mouse/atarimouse.c @@ -47,7 +47,6 @@ #include #include -#include #include #include #include diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c index be3316073ae7..09a089996ded 100644 --- a/drivers/input/serio/hp_sdc.c +++ b/drivers/input/serio/hp_sdc.c @@ -71,7 +71,6 @@ #include #include #include -#include /* Machine-specific abstraction */ diff --git a/drivers/input/serio/maceps2.c b/drivers/input/serio/maceps2.c index 558200e96d0f..61da763b1209 100644 --- a/drivers/input/serio/maceps2.c +++ b/drivers/input/serio/maceps2.c @@ -21,7 +21,6 @@ #include #include -#include #include #include diff --git a/drivers/input/serio/rpckbd.c b/drivers/input/serio/rpckbd.c index 8b44ddc8041c..d8aac177307f 100644 --- a/drivers/input/serio/rpckbd.c +++ b/drivers/input/serio/rpckbd.c @@ -39,7 +39,6 @@ #include #include #include -#include MODULE_AUTHOR("Vojtech Pavlik, Russell King"); MODULE_DESCRIPTION("Acorn RiscPC PS/2 keyboard controller driver"); diff --git a/drivers/input/serio/sa1111ps2.c b/drivers/input/serio/sa1111ps2.c index 44fc8b4bcd81..e3c85fafe937 100644 --- a/drivers/input/serio/sa1111ps2.c +++ b/drivers/input/serio/sa1111ps2.c @@ -20,7 +20,6 @@ #include #include -#include #include diff --git a/drivers/isdn/hardware/avm/avm_cs.c b/drivers/isdn/hardware/avm/avm_cs.c index 44b50cc645e6..c21353d8e915 100644 --- a/drivers/isdn/hardware/avm/avm_cs.c +++ b/drivers/isdn/hardware/avm/avm_cs.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/drivers/isdn/hisax/avma1_cs.c b/drivers/isdn/hisax/avma1_cs.c index 33e3c94887d8..c644557ae614 100644 --- a/drivers/isdn/hisax/avma1_cs.c +++ b/drivers/isdn/hisax/avma1_cs.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/drivers/isdn/hisax/elsa_cs.c b/drivers/isdn/hisax/elsa_cs.c index fe254e74a850..a8c4d3fc9a6d 100644 --- a/drivers/isdn/hisax/elsa_cs.c +++ b/drivers/isdn/hisax/elsa_cs.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include diff --git a/drivers/isdn/hisax/sedlbauer_cs.c b/drivers/isdn/hisax/sedlbauer_cs.c index 68f50495d166..f0dfc0c976eb 100644 --- a/drivers/isdn/hisax/sedlbauer_cs.c +++ b/drivers/isdn/hisax/sedlbauer_cs.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include diff --git a/drivers/isdn/hisax/teles_cs.c b/drivers/isdn/hisax/teles_cs.c index bfe94284b0d5..4deac451807c 100644 --- a/drivers/isdn/hisax/teles_cs.c +++ b/drivers/isdn/hisax/teles_cs.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/drivers/isdn/i4l/isdn_bsdcomp.c b/drivers/isdn/i4l/isdn_bsdcomp.c index 7f3c54d40474..c59e8d2c0675 100644 --- a/drivers/isdn/i4l/isdn_bsdcomp.c +++ b/drivers/isdn/i4l/isdn_bsdcomp.c @@ -69,7 +69,6 @@ #include /* used in new tty drivers */ #include -#include #include #include diff --git a/drivers/isdn/pcbit/layer2.c b/drivers/isdn/pcbit/layer2.c index 682911f81138..a18e639b40d7 100644 --- a/drivers/isdn/pcbit/layer2.c +++ b/drivers/isdn/pcbit/layer2.c @@ -36,7 +36,6 @@ #include -#include #include diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index b6ef8f590764..87de8d9bcfad 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/therm_adt746x.c b/drivers/macintosh/therm_adt746x.c index c60d025044ee..fc71723cbc48 100644 --- a/drivers/macintosh/therm_adt746x.c +++ b/drivers/macintosh/therm_adt746x.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #undef DEBUG diff --git a/drivers/macintosh/therm_pm72.c b/drivers/macintosh/therm_pm72.c index 0ff92c208005..97cfc5ac9fd0 100644 --- a/drivers/macintosh/therm_pm72.c +++ b/drivers/macintosh/therm_pm72.c @@ -127,7 +127,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/therm_windtunnel.c b/drivers/macintosh/therm_windtunnel.c index 46c4e95f10d6..3b4a157714b1 100644 --- a/drivers/macintosh/therm_windtunnel.c +++ b/drivers/macintosh/therm_windtunnel.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c index 971bc9582a5f..86511c570dd8 100644 --- a/drivers/macintosh/via-cuda.c +++ b/drivers/macintosh/via-cuda.c @@ -26,7 +26,6 @@ #include #endif #include -#include #include static volatile unsigned char __iomem *via; diff --git a/drivers/macintosh/via-macii.c b/drivers/macintosh/via-macii.c index c9570fcf1cce..3725f088f17e 100644 --- a/drivers/macintosh/via-macii.c +++ b/drivers/macintosh/via-macii.c @@ -34,7 +34,6 @@ #include #include #include -#include static volatile unsigned char *via; diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 6cccd60c594e..22b8ce4191cc 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/macintosh/via-pmu68k.c b/drivers/macintosh/via-pmu68k.c index aeb30d07d5a2..a00ee41f0573 100644 --- a/drivers/macintosh/via-pmu68k.c +++ b/drivers/macintosh/via-pmu68k.c @@ -37,7 +37,6 @@ #include #include -#include #include #include diff --git a/drivers/macintosh/windfarm_lm75_sensor.c b/drivers/macintosh/windfarm_lm75_sensor.c index 647c6add2193..4d6a90a1372b 100644 --- a/drivers/macintosh/windfarm_lm75_sensor.c +++ b/drivers/macintosh/windfarm_lm75_sensor.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/windfarm_pm121.c b/drivers/macintosh/windfarm_pm121.c index 30e6195e19d4..04067e073aa9 100644 --- a/drivers/macintosh/windfarm_pm121.c +++ b/drivers/macintosh/windfarm_pm121.c @@ -215,7 +215,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/windfarm_pm81.c b/drivers/macintosh/windfarm_pm81.c index 749d174b0dc6..fc13d0f2663b 100644 --- a/drivers/macintosh/windfarm_pm81.c +++ b/drivers/macintosh/windfarm_pm81.c @@ -107,7 +107,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/windfarm_pm91.c b/drivers/macintosh/windfarm_pm91.c index 344273235124..a9430ed4f36c 100644 --- a/drivers/macintosh/windfarm_pm91.c +++ b/drivers/macintosh/windfarm_pm91.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/windfarm_smu_controls.c b/drivers/macintosh/windfarm_smu_controls.c index 43137b421f92..3c2be5193fd5 100644 --- a/drivers/macintosh/windfarm_smu_controls.c +++ b/drivers/macintosh/windfarm_smu_controls.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/drivers/macintosh/windfarm_smu_sensors.c b/drivers/macintosh/windfarm_smu_sensors.c index 3c193504bb80..1cc4e4953d89 100644 --- a/drivers/macintosh/windfarm_smu_sensors.c +++ b/drivers/macintosh/windfarm_smu_sensors.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c index e4b5c03ae516..73970cd97af1 100644 --- a/drivers/media/dvb/dvb-core/dmxdev.c +++ b/drivers/media/dvb/dvb-core/dmxdev.c @@ -29,7 +29,6 @@ #include #include #include -#include #include "dmxdev.h" static int debug; diff --git a/drivers/media/dvb/firewire/firedtv-fw.c b/drivers/media/dvb/firewire/firedtv-fw.c index 864b6274c729..e24ec539a5fd 100644 --- a/drivers/media/dvb/firewire/firedtv-fw.c +++ b/drivers/media/dvb/firewire/firedtv-fw.c @@ -20,7 +20,6 @@ #include #include -#include #include diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c index 6ecbcf614878..4bd8bd56befc 100644 --- a/drivers/media/dvb/ttpci/av7110.c +++ b/drivers/media/dvb/ttpci/av7110.c @@ -53,7 +53,6 @@ #include #include -#include #include diff --git a/drivers/media/media-devnode.c b/drivers/media/media-devnode.c index 7b42ace419d9..04d6d541d862 100644 --- a/drivers/media/media-devnode.c +++ b/drivers/media/media-devnode.c @@ -40,7 +40,6 @@ #include #include #include -#include #include diff --git a/drivers/media/video/ivtv/ivtv-driver.h b/drivers/media/video/ivtv/ivtv-driver.h index 06f3d78389bf..52d5dd71032e 100644 --- a/drivers/media/video/ivtv/ivtv-driver.h +++ b/drivers/media/video/ivtv/ivtv-driver.h @@ -54,7 +54,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/video/v4l2-common.c b/drivers/media/video/v4l2-common.c index 5c6100fb4072..1baec8393306 100644 --- a/drivers/media/video/v4l2-common.c +++ b/drivers/media/video/v4l2-common.c @@ -55,7 +55,6 @@ #include #endif #include -#include #include #include #include diff --git a/drivers/media/video/v4l2-dev.c b/drivers/media/video/v4l2-dev.c index 96e9615663e9..8546f8129448 100644 --- a/drivers/media/video/v4l2-dev.c +++ b/drivers/media/video/v4l2-dev.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/drivers/message/i2o/i2o_scsi.c b/drivers/message/i2o/i2o_scsi.c index c8ed7b63fdf5..1d31d7284cbd 100644 --- a/drivers/message/i2o/i2o_scsi.c +++ b/drivers/message/i2o/i2o_scsi.c @@ -57,7 +57,6 @@ #include #include -#include #include #include diff --git a/drivers/mfd/mcp-core.c b/drivers/mfd/mcp-core.c index 86cc3f7841cd..32c82de81ada 100644 --- a/drivers/mfd/mcp-core.c +++ b/drivers/mfd/mcp-core.c @@ -20,7 +20,6 @@ #include #include -#include #define to_mcp(d) container_of(d, struct mcp, attached_device) diff --git a/drivers/mfd/mcp-sa11x0.c b/drivers/mfd/mcp-sa11x0.c index 02c53a0766c4..4a27c2b9a891 100644 --- a/drivers/mfd/mcp-sa11x0.c +++ b/drivers/mfd/mcp-sa11x0.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/drivers/misc/sgi-xp/xp.h b/drivers/misc/sgi-xp/xp.h index 851b2f25ce0e..c862cd4583cc 100644 --- a/drivers/misc/sgi-xp/xp.h +++ b/drivers/misc/sgi-xp/xp.h @@ -25,7 +25,6 @@ #endif #if defined CONFIG_IA64 -#include #include /* defines is_shub1() and is_shub2() */ #define is_shub() ia64_platform_is("sn2") #endif diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index e5a3c7b6dedb..4c3b2847e47c 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -41,7 +41,6 @@ #include #include -#include #include #include "queue.h" diff --git a/drivers/mtd/devices/pmc551.c b/drivers/mtd/devices/pmc551.c index ecff765579dd..5d53c5760a6c 100644 --- a/drivers/mtd/devices/pmc551.c +++ b/drivers/mtd/devices/pmc551.c @@ -93,7 +93,6 @@ #include #include #include -#include #include #include diff --git a/drivers/mtd/devices/slram.c b/drivers/mtd/devices/slram.c index e585263161b9..288594163c22 100644 --- a/drivers/mtd/devices/slram.c +++ b/drivers/mtd/devices/slram.c @@ -42,7 +42,6 @@ #include #include #include -#include #include diff --git a/drivers/mtd/maps/pcmciamtd.c b/drivers/mtd/maps/pcmciamtd.c index e8e9fec23553..0259cf583022 100644 --- a/drivers/mtd/maps/pcmciamtd.c +++ b/drivers/mtd/maps/pcmciamtd.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include diff --git a/drivers/mtd/nand/bcm_umi_nand.c b/drivers/mtd/nand/bcm_umi_nand.c index 50387fd4009b..64c9cbaf86a1 100644 --- a/drivers/mtd/nand/bcm_umi_nand.c +++ b/drivers/mtd/nand/bcm_umi_nand.c @@ -31,7 +31,6 @@ #include #include -#include #include #include diff --git a/drivers/net/appletalk/cops.c b/drivers/net/appletalk/cops.c index 9abd4eb86dc1..dd5e04813b76 100644 --- a/drivers/net/appletalk/cops.c +++ b/drivers/net/appletalk/cops.c @@ -70,7 +70,6 @@ static const char *version = #include #include -#include #include #include diff --git a/drivers/net/appletalk/ltpc.c b/drivers/net/appletalk/ltpc.c index 6057b30417a2..0910dce3996d 100644 --- a/drivers/net/appletalk/ltpc.c +++ b/drivers/net/appletalk/ltpc.c @@ -229,7 +229,6 @@ static int dma; #include #include -#include #include #include diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c index 980e65c14936..5bed4c4e2508 100644 --- a/drivers/net/arcnet/com20020_cs.c +++ b/drivers/net/arcnet/com20020_cs.c @@ -47,7 +47,6 @@ #include #include -#include #define VERSION "arcnet: COM20020 PCMCIA support loaded.\n" diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 0730203a19f2..d6e85864beea 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index 98a5a7d867f5..034c16b60e96 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -40,7 +40,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/cris/eth_v10.c b/drivers/net/cris/eth_v10.c index 7cb2785e209d..ec03b401620a 100644 --- a/drivers/net/cris/eth_v10.c +++ b/drivers/net/cris/eth_v10.c @@ -35,7 +35,6 @@ #include /* CRIS_LED_* I/O functions */ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/3com/3c574_cs.c b/drivers/net/ethernet/3com/3c574_cs.c index e61b2f82ba3a..66df93638085 100644 --- a/drivers/net/ethernet/3com/3c574_cs.c +++ b/drivers/net/ethernet/3com/3c574_cs.c @@ -95,7 +95,6 @@ earlier 3Com products. #include #include -#include /*====================================================================*/ diff --git a/drivers/net/ethernet/3com/3c589_cs.c b/drivers/net/ethernet/3com/3c589_cs.c index b23253b9f742..a556c01e011b 100644 --- a/drivers/net/ethernet/3com/3c589_cs.c +++ b/drivers/net/ethernet/3com/3c589_cs.c @@ -50,7 +50,6 @@ #include #include -#include /* To minimize the size of the driver source I only define operating constants if they are used several times. You'll need the manual diff --git a/drivers/net/ethernet/8390/3c503.c b/drivers/net/ethernet/8390/3c503.c index fbab1367505f..49d76bd0dc86 100644 --- a/drivers/net/ethernet/8390/3c503.c +++ b/drivers/net/ethernet/8390/3c503.c @@ -54,7 +54,6 @@ static const char version[] = #include #include -#include #include #include "8390.h" diff --git a/drivers/net/ethernet/8390/ac3200.c b/drivers/net/ethernet/8390/ac3200.c index 5337dd0a59b0..ccf07942ff6e 100644 --- a/drivers/net/ethernet/8390/ac3200.c +++ b/drivers/net/ethernet/8390/ac3200.c @@ -34,7 +34,6 @@ static const char version[] = #include #include -#include #include #include diff --git a/drivers/net/ethernet/8390/apne.c b/drivers/net/ethernet/8390/apne.c index 3ad5d2f9a49c..923959275a82 100644 --- a/drivers/net/ethernet/8390/apne.c +++ b/drivers/net/ethernet/8390/apne.c @@ -39,7 +39,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/8390/ax88796.c b/drivers/net/ethernet/8390/ax88796.c index c30adcc9828a..11476ca95e93 100644 --- a/drivers/net/ethernet/8390/ax88796.c +++ b/drivers/net/ethernet/8390/ax88796.c @@ -31,7 +31,6 @@ #include -#include /* Rename the lib8390.c functions to show that they are in this driver */ #define __ei_open ax_ei_open diff --git a/drivers/net/ethernet/8390/axnet_cs.c b/drivers/net/ethernet/8390/axnet_cs.c index c5bd8eb7a9f5..e1b3941bd149 100644 --- a/drivers/net/ethernet/8390/axnet_cs.c +++ b/drivers/net/ethernet/8390/axnet_cs.c @@ -46,7 +46,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/8390/e2100.c b/drivers/net/ethernet/8390/e2100.c index d16dc53c1813..ed55ce85ebbf 100644 --- a/drivers/net/ethernet/8390/e2100.c +++ b/drivers/net/ethernet/8390/e2100.c @@ -48,7 +48,6 @@ static const char version[] = #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/es3210.c b/drivers/net/ethernet/8390/es3210.c index 6428f9e7a554..ba1b5c95531f 100644 --- a/drivers/net/ethernet/8390/es3210.c +++ b/drivers/net/ethernet/8390/es3210.c @@ -59,7 +59,6 @@ static const char version[] = #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/etherh.c b/drivers/net/ethernet/8390/etherh.c index a45b0d8a9f12..dbefd5658c14 100644 --- a/drivers/net/ethernet/8390/etherh.c +++ b/drivers/net/ethernet/8390/etherh.c @@ -45,7 +45,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/8390/hp-plus.c b/drivers/net/ethernet/8390/hp-plus.c index d42938b6b596..52f70f999c00 100644 --- a/drivers/net/ethernet/8390/hp-plus.c +++ b/drivers/net/ethernet/8390/hp-plus.c @@ -33,7 +33,6 @@ static const char version[] = #include #include -#include #include #include "8390.h" diff --git a/drivers/net/ethernet/8390/hp.c b/drivers/net/ethernet/8390/hp.c index 113f1e075a26..37fa89aa4578 100644 --- a/drivers/net/ethernet/8390/hp.c +++ b/drivers/net/ethernet/8390/hp.c @@ -33,7 +33,6 @@ static const char version[] = #include #include -#include #include #include "8390.h" diff --git a/drivers/net/ethernet/8390/lib8390.c b/drivers/net/ethernet/8390/lib8390.c index e77f624e8194..b329f5c0d62b 100644 --- a/drivers/net/ethernet/8390/lib8390.c +++ b/drivers/net/ethernet/8390/lib8390.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/8390/lne390.c b/drivers/net/ethernet/8390/lne390.c index 69490ae018ea..479409bf2e3c 100644 --- a/drivers/net/ethernet/8390/lne390.c +++ b/drivers/net/ethernet/8390/lne390.c @@ -46,7 +46,6 @@ static const char *version = #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/mac8390.c b/drivers/net/ethernet/8390/mac8390.c index af5d9822cad9..88ccc8b14f0a 100644 --- a/drivers/net/ethernet/8390/mac8390.c +++ b/drivers/net/ethernet/8390/mac8390.c @@ -37,7 +37,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/8390/ne-h8300.c b/drivers/net/ethernet/8390/ne-h8300.c index 9b9c77d5a65c..7fc28f2d28a6 100644 --- a/drivers/net/ethernet/8390/ne-h8300.c +++ b/drivers/net/ethernet/8390/ne-h8300.c @@ -29,7 +29,6 @@ static const char version1[] = #include #include -#include #include #include diff --git a/drivers/net/ethernet/8390/ne.c b/drivers/net/ethernet/8390/ne.c index f92ea2a65a57..d04911d33b64 100644 --- a/drivers/net/ethernet/8390/ne.c +++ b/drivers/net/ethernet/8390/ne.c @@ -53,7 +53,6 @@ static const char version2[] = #include #include -#include #include #include "8390.h" diff --git a/drivers/net/ethernet/8390/ne2.c b/drivers/net/ethernet/8390/ne2.c index 922b32036c63..ef85839f43d8 100644 --- a/drivers/net/ethernet/8390/ne2.c +++ b/drivers/net/ethernet/8390/ne2.c @@ -76,7 +76,6 @@ static const char *version = "ne2.c:v0.91 Nov 16 1998 Wim Dumon #include -#include #include #include diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c index 3fab04a0034a..5e8845febfb8 100644 --- a/drivers/net/ethernet/8390/ne2k-pci.c +++ b/drivers/net/ethernet/8390/ne2k-pci.c @@ -54,7 +54,6 @@ static int options[MAX_UNITS]; #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/8390/ne3210.c b/drivers/net/ethernet/8390/ne3210.c index 2a3e8057feae..a2f8b2b8e27c 100644 --- a/drivers/net/ethernet/8390/ne3210.c +++ b/drivers/net/ethernet/8390/ne3210.c @@ -39,7 +39,6 @@ #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/pcnet_cs.c b/drivers/net/ethernet/8390/pcnet_cs.c index f2a4e5de18c4..de1af0bfed4c 100644 --- a/drivers/net/ethernet/8390/pcnet_cs.c +++ b/drivers/net/ethernet/8390/pcnet_cs.c @@ -49,7 +49,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/8390/smc-mca.c b/drivers/net/ethernet/8390/smc-mca.c index 77efec44fea0..7a68590f2804 100644 --- a/drivers/net/ethernet/8390/smc-mca.c +++ b/drivers/net/ethernet/8390/smc-mca.c @@ -47,7 +47,6 @@ #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/smc-ultra.c b/drivers/net/ethernet/8390/smc-ultra.c index 1cc306a83ff7..b0fbce39661a 100644 --- a/drivers/net/ethernet/8390/smc-ultra.c +++ b/drivers/net/ethernet/8390/smc-ultra.c @@ -69,7 +69,6 @@ static const char version[] = #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/smc-ultra32.c b/drivers/net/ethernet/8390/smc-ultra32.c index bb87053eb3da..923e42aedcfd 100644 --- a/drivers/net/ethernet/8390/smc-ultra32.c +++ b/drivers/net/ethernet/8390/smc-ultra32.c @@ -57,7 +57,6 @@ static const char *version = "smc-ultra32.c: 06/97 v1.00\n"; #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/stnic.c b/drivers/net/ethernet/8390/stnic.c index 3b903759980a..8df4c4157230 100644 --- a/drivers/net/ethernet/8390/stnic.c +++ b/drivers/net/ethernet/8390/stnic.c @@ -17,7 +17,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/8390/wd.c b/drivers/net/ethernet/8390/wd.c index c175fadb597b..03eb3eed49fa 100644 --- a/drivers/net/ethernet/8390/wd.c +++ b/drivers/net/ethernet/8390/wd.c @@ -39,7 +39,6 @@ static const char version[] = #include #include -#include #include "8390.h" diff --git a/drivers/net/ethernet/8390/zorro8390.c b/drivers/net/ethernet/8390/zorro8390.c index bcd27323b203..7818e6397e91 100644 --- a/drivers/net/ethernet/8390/zorro8390.c +++ b/drivers/net/ethernet/8390/zorro8390.c @@ -31,7 +31,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/alteon/acenic.c b/drivers/net/ethernet/alteon/acenic.c index 6c3b1c0adaa0..7219123fa0a4 100644 --- a/drivers/net/ethernet/alteon/acenic.c +++ b/drivers/net/ethernet/alteon/acenic.c @@ -78,7 +78,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/amd/7990.c b/drivers/net/ethernet/amd/7990.c index 1b046f58d58f..6e722dc37db7 100644 --- a/drivers/net/ethernet/amd/7990.c +++ b/drivers/net/ethernet/amd/7990.c @@ -33,7 +33,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/amd/am79c961a.c b/drivers/net/ethernet/amd/am79c961a.c index cc7b9e46780c..e10ffad525a7 100644 --- a/drivers/net/ethernet/amd/am79c961a.c +++ b/drivers/net/ethernet/amd/am79c961a.c @@ -30,7 +30,6 @@ #include #include -#include #define TX_BUFFERS 15 #define RX_BUFFERS 25 diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 9f62504d0086..64d0d9c1afa2 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -88,7 +88,6 @@ Revision History: #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/amd/declance.c b/drivers/net/ethernet/amd/declance.c index 7dc508e5c72e..75299f500ee5 100644 --- a/drivers/net/ethernet/amd/declance.c +++ b/drivers/net/ethernet/amd/declance.c @@ -64,7 +64,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/amd/hplance.c b/drivers/net/ethernet/amd/hplance.c index 4e2d68a4de8a..8baff4e5d964 100644 --- a/drivers/net/ethernet/amd/hplance.c +++ b/drivers/net/ethernet/amd/hplance.c @@ -22,7 +22,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/amd/mvme147.c b/drivers/net/ethernet/amd/mvme147.c index 56bc47a94186..9af3c307862c 100644 --- a/drivers/net/ethernet/amd/mvme147.c +++ b/drivers/net/ethernet/amd/mvme147.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/amd/nmclan_cs.c b/drivers/net/ethernet/amd/nmclan_cs.c index ebdb9e238a8d..9f59bf63514b 100644 --- a/drivers/net/ethernet/amd/nmclan_cs.c +++ b/drivers/net/ethernet/amd/nmclan_cs.c @@ -154,7 +154,6 @@ Include Files #include #include -#include /* ---------------------------------------------------------------------------- Defines diff --git a/drivers/net/ethernet/amd/sunlance.c b/drivers/net/ethernet/amd/sunlance.c index e3fe3504e198..d7a3533d990b 100644 --- a/drivers/net/ethernet/amd/sunlance.c +++ b/drivers/net/ethernet/amd/sunlance.c @@ -95,7 +95,6 @@ static char lancestr[] = "LANCE"; #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index b0657466041d..a374f2788347 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -48,7 +48,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/cirrus/cs89x0.c b/drivers/net/ethernet/cirrus/cs89x0.c index d5ff93653e4c..98c171b3a45e 100644 --- a/drivers/net/ethernet/cirrus/cs89x0.c +++ b/drivers/net/ethernet/cirrus/cs89x0.c @@ -148,7 +148,6 @@ #include #include -#include #include #include #if ALLOW_DMA diff --git a/drivers/net/ethernet/cirrus/mac89x0.c b/drivers/net/ethernet/cirrus/mac89x0.c index 932fdccc339a..e285f384b096 100644 --- a/drivers/net/ethernet/cirrus/mac89x0.c +++ b/drivers/net/ethernet/cirrus/mac89x0.c @@ -99,7 +99,6 @@ static char *version = #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/dlink/de600.c b/drivers/net/ethernet/dlink/de600.c index 682750c052c8..414f0eea1049 100644 --- a/drivers/net/ethernet/dlink/de600.c +++ b/drivers/net/ethernet/dlink/de600.c @@ -46,7 +46,6 @@ static const char version[] = "de600.c: $Revision: 1.41-2.5 $, Bjorn Ekwall (bj #include #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/dlink/de620.c b/drivers/net/ethernet/dlink/de620.c index afc5aaac6b60..2e2bc60ee811 100644 --- a/drivers/net/ethernet/dlink/de620.c +++ b/drivers/net/ethernet/dlink/de620.c @@ -122,7 +122,6 @@ static const char version[] = #include #include -#include /* Constant definitions for the DE-620 registers, commands and bits */ #include "de620.h" diff --git a/drivers/net/ethernet/fujitsu/at1700.c b/drivers/net/ethernet/fujitsu/at1700.c index 586b46fd4eed..3d94797c8f9b 100644 --- a/drivers/net/ethernet/fujitsu/at1700.c +++ b/drivers/net/ethernet/fujitsu/at1700.c @@ -52,7 +52,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/fujitsu/eth16i.c b/drivers/net/ethernet/fujitsu/eth16i.c index c3f0178fb5cb..a992d1f7e0d2 100644 --- a/drivers/net/ethernet/fujitsu/eth16i.c +++ b/drivers/net/ethernet/fujitsu/eth16i.c @@ -163,7 +163,6 @@ static char *version = #include #include -#include #include diff --git a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c index 0230319ddb59..2418faf2251a 100644 --- a/drivers/net/ethernet/fujitsu/fmvj18x_cs.c +++ b/drivers/net/ethernet/fujitsu/fmvj18x_cs.c @@ -57,7 +57,6 @@ #include #include -#include /*====================================================================*/ diff --git a/drivers/net/ethernet/i825xx/3c507.c b/drivers/net/ethernet/i825xx/3c507.c index ed6925f11479..e8984b059905 100644 --- a/drivers/net/ethernet/i825xx/3c507.c +++ b/drivers/net/ethernet/i825xx/3c507.c @@ -63,7 +63,6 @@ static const char version[] = #include #include -#include #include /* use 0 for production, 1 for verification, 2..7 for debug */ diff --git a/drivers/net/ethernet/i825xx/3c527.c b/drivers/net/ethernet/i825xx/3c527.c index ef43f3e951c5..278e791afe00 100644 --- a/drivers/net/ethernet/i825xx/3c527.c +++ b/drivers/net/ethernet/i825xx/3c527.c @@ -106,7 +106,6 @@ DRV_NAME ".c:v" DRV_VERSION " " DRV_RELDATE " Richard Procter #include -#include #include #include diff --git a/drivers/net/ethernet/i825xx/eepro.c b/drivers/net/ethernet/i825xx/eepro.c index 7a4ad4a07917..7f49fd54c521 100644 --- a/drivers/net/ethernet/i825xx/eepro.c +++ b/drivers/net/ethernet/i825xx/eepro.c @@ -148,7 +148,6 @@ static const char version[] = #include #include -#include #include #include diff --git a/drivers/net/ethernet/i825xx/eexpress.c b/drivers/net/ethernet/i825xx/eexpress.c index 3fc649e54a32..cc2e66ad4436 100644 --- a/drivers/net/ethernet/i825xx/eexpress.c +++ b/drivers/net/ethernet/i825xx/eexpress.c @@ -116,7 +116,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/i825xx/ether1.c b/drivers/net/ethernet/i825xx/ether1.c index 406a12b46404..067db3f13e91 100644 --- a/drivers/net/ethernet/i825xx/ether1.c +++ b/drivers/net/ethernet/i825xx/ether1.c @@ -48,7 +48,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/i825xx/znet.c b/drivers/net/ethernet/i825xx/znet.c index a43649735a04..bd1f1ef91e19 100644 --- a/drivers/net/ethernet/i825xx/znet.c +++ b/drivers/net/ethernet/i825xx/znet.c @@ -100,7 +100,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index f30db1c46600..bc58f1dc22f5 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -55,7 +55,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index 75af1afe46c8..5e1ca0f05090 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -57,7 +57,6 @@ #include #include #include -#include static char mv643xx_eth_driver_name[] = "mv643xx_eth"; static char mv643xx_eth_driver_version[] = "1.4"; diff --git a/drivers/net/ethernet/marvell/pxa168_eth.c b/drivers/net/ethernet/marvell/pxa168_eth.c index 45a6333588e6..efec6b60b327 100644 --- a/drivers/net/ethernet/marvell/pxa168_eth.c +++ b/drivers/net/ethernet/marvell/pxa168_eth.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/ethernet/natsemi/jazzsonic.c b/drivers/net/ethernet/natsemi/jazzsonic.c index 5b89fd377ae3..95dd39ffb230 100644 --- a/drivers/net/ethernet/natsemi/jazzsonic.c +++ b/drivers/net/ethernet/natsemi/jazzsonic.c @@ -38,7 +38,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/natsemi/macsonic.c b/drivers/net/ethernet/natsemi/macsonic.c index e640e23460de..b9680ba5a325 100644 --- a/drivers/net/ethernet/natsemi/macsonic.c +++ b/drivers/net/ethernet/natsemi/macsonic.c @@ -53,7 +53,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/natsemi/ns83820.c b/drivers/net/ethernet/natsemi/ns83820.c index c24b46cbfe27..d52728b3c436 100644 --- a/drivers/net/ethernet/natsemi/ns83820.c +++ b/drivers/net/ethernet/natsemi/ns83820.c @@ -121,7 +121,6 @@ #include #include -#include #define DRV_NAME "ns83820" diff --git a/drivers/net/ethernet/neterion/s2io.c b/drivers/net/ethernet/neterion/s2io.c index 22a8de00bf02..6338ef8606ae 100644 --- a/drivers/net/ethernet/neterion/s2io.c +++ b/drivers/net/ethernet/neterion/s2io.c @@ -81,7 +81,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 8561dd25db66..aca13046e432 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -69,7 +69,6 @@ #include #include -#include #define TX_WORK_PER_LOOP 64 #define RX_WORK_PER_LOOP 64 diff --git a/drivers/net/ethernet/realtek/atp.c b/drivers/net/ethernet/realtek/atp.c index 46c1932048cb..e02f04d7f3ad 100644 --- a/drivers/net/ethernet/realtek/atp.c +++ b/drivers/net/ethernet/realtek/atp.c @@ -140,7 +140,6 @@ static int xcvr[NUM_UNITS]; /* The data transfer mode. */ #include #include -#include #include #include diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 27c358c8f4dc..7b23554f80b6 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -29,7 +29,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/seeq/ether3.c b/drivers/net/ethernet/seeq/ether3.c index 7b819bd8c416..df808ac8cb65 100644 --- a/drivers/net/ethernet/seeq/ether3.c +++ b/drivers/net/ethernet/seeq/ether3.c @@ -64,7 +64,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/seeq/seeq8005.c b/drivers/net/ethernet/seeq/seeq8005.c index 798990774446..698edbbfc149 100644 --- a/drivers/net/ethernet/seeq/seeq8005.c +++ b/drivers/net/ethernet/seeq/seeq8005.c @@ -47,7 +47,6 @@ static const char version[] = #include #include -#include #include #include diff --git a/drivers/net/ethernet/smsc/smc91c92_cs.c b/drivers/net/ethernet/smsc/smc91c92_cs.c index d12e48a7861d..04393b5fef71 100644 --- a/drivers/net/ethernet/smsc/smc91c92_cs.c +++ b/drivers/net/ethernet/smsc/smc91c92_cs.c @@ -53,7 +53,6 @@ #include #include -#include #include /*====================================================================*/ diff --git a/drivers/net/ethernet/sun/cassini.c b/drivers/net/ethernet/sun/cassini.c index 3c2295560732..ce4df61b4b56 100644 --- a/drivers/net/ethernet/sun/cassini.c +++ b/drivers/net/ethernet/sun/cassini.c @@ -99,7 +99,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/sun/sunbmac.c b/drivers/net/ethernet/sun/sunbmac.c index f359863b5340..2a83fc57edba 100644 --- a/drivers/net/ethernet/sun/sunbmac.c +++ b/drivers/net/ethernet/sun/sunbmac.c @@ -35,7 +35,6 @@ #include #include #include -#include #include "sunbmac.h" diff --git a/drivers/net/ethernet/sun/sungem.c b/drivers/net/ethernet/sun/sungem.c index ba041596e046..558409ff4058 100644 --- a/drivers/net/ethernet/sun/sungem.c +++ b/drivers/net/ethernet/sun/sungem.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/sun/sunhme.c b/drivers/net/ethernet/sun/sunhme.c index 8b627e2f798d..b95e7e681b38 100644 --- a/drivers/net/ethernet/sun/sunhme.c +++ b/drivers/net/ethernet/sun/sunhme.c @@ -36,7 +36,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/sun/sunqe.c b/drivers/net/ethernet/sun/sunqe.c index 139d6b410d68..7d4a040d84a2 100644 --- a/drivers/net/ethernet/sun/sunqe.c +++ b/drivers/net/ethernet/sun/sunqe.c @@ -28,7 +28,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/ethernet/tundra/tsi108_eth.c b/drivers/net/ethernet/tundra/tsi108_eth.c index 840e0e9031f5..277c93e9ff4d 100644 --- a/drivers/net/ethernet/tundra/tsi108_eth.c +++ b/drivers/net/ethernet/tundra/tsi108_eth.c @@ -50,7 +50,6 @@ #include #include -#include #include #include diff --git a/drivers/net/ethernet/xircom/xirc2ps_cs.c b/drivers/net/ethernet/xircom/xirc2ps_cs.c index 5c69c6f93fb8..94a1f94f74b8 100644 --- a/drivers/net/ethernet/xircom/xirc2ps_cs.c +++ b/drivers/net/ethernet/xircom/xirc2ps_cs.c @@ -89,7 +89,6 @@ #include #include -#include #include #ifndef MANFID_COMPAQ diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 2a5a34d2d67b..64783a0d545a 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -13,7 +13,6 @@ */ #include -#include #include #include #include diff --git a/drivers/net/hamradio/baycom_par.c b/drivers/net/hamradio/baycom_par.c index f1aea0c98333..acb636963e90 100644 --- a/drivers/net/hamradio/baycom_par.c +++ b/drivers/net/hamradio/baycom_par.c @@ -86,7 +86,6 @@ #include #include -#include #include /* --------------------------------------------------------------------- */ diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index 18d8affecd1b..76d54774ba82 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -69,7 +69,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index bc02968cee16..aed1a6105b24 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -17,7 +17,6 @@ * Copyright (C) 2004, 05 Thomas Osterried DL9SAU */ #include -#include #include #include #include diff --git a/drivers/net/hamradio/scc.c b/drivers/net/hamradio/scc.c index 33655814448e..efc6c97163a7 100644 --- a/drivers/net/hamradio/scc.c +++ b/drivers/net/hamradio/scc.c @@ -177,7 +177,6 @@ #include #include -#include #include #include diff --git a/drivers/net/hamradio/yam.c b/drivers/net/hamradio/yam.c index 696327773fbe..5a6412ecce73 100644 --- a/drivers/net/hamradio/yam.c +++ b/drivers/net/hamradio/yam.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/hippi/rrunner.c b/drivers/net/hippi/rrunner.c index 2a51363d9fed..168c8f41d09f 100644 --- a/drivers/net/hippi/rrunner.c +++ b/drivers/net/hippi/rrunner.c @@ -43,7 +43,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/irda/donauboe.c b/drivers/net/irda/donauboe.c index 617a446d126c..4351296dde32 100644 --- a/drivers/net/irda/donauboe.c +++ b/drivers/net/irda/donauboe.c @@ -156,7 +156,6 @@ #include #include -#include #include #include diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index b71998d0b5b4..32eb94ece6c1 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -41,7 +41,6 @@ #include #include -#include #include #include diff --git a/drivers/net/plip/plip.c b/drivers/net/plip/plip.c index 1a5a316cc968..bed62d9c53c8 100644 --- a/drivers/net/plip/plip.c +++ b/drivers/net/plip/plip.c @@ -113,7 +113,6 @@ static const char version[] = "NET3 PLIP version 2.4-parport gniibe@mri.co.jp\n" #include -#include #include #include diff --git a/drivers/net/slip/slhc.c b/drivers/net/slip/slhc.c index 0a0a6643cf3a..1252d9c726a7 100644 --- a/drivers/net/slip/slhc.c +++ b/drivers/net/slip/slhc.c @@ -75,7 +75,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/slip/slip.c b/drivers/net/slip/slip.c index 69345dfae0fd..d4c9db3da22a 100644 --- a/drivers/net/slip/slip.c +++ b/drivers/net/slip/slip.c @@ -64,7 +64,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/tokenring/3c359.c b/drivers/net/tokenring/3c359.c index d7c292aa76b1..b15ac81d46fa 100644 --- a/drivers/net/tokenring/3c359.c +++ b/drivers/net/tokenring/3c359.c @@ -68,7 +68,6 @@ #include #include -#include #include "3c359.h" diff --git a/drivers/net/tokenring/abyss.c b/drivers/net/tokenring/abyss.c index 515f122777ab..b715e6b444da 100644 --- a/drivers/net/tokenring/abyss.c +++ b/drivers/net/tokenring/abyss.c @@ -33,7 +33,6 @@ #include #include -#include #include #include diff --git a/drivers/net/tokenring/ibmtr_cs.c b/drivers/net/tokenring/ibmtr_cs.c index 91b684630fc5..356e28e4881b 100644 --- a/drivers/net/tokenring/ibmtr_cs.c +++ b/drivers/net/tokenring/ibmtr_cs.c @@ -63,7 +63,6 @@ #include #include -#include #define PCMCIA #include "ibmtr.c" diff --git a/drivers/net/tokenring/lanstreamer.c b/drivers/net/tokenring/lanstreamer.c index 8d71e0d29062..3e4b4f091113 100644 --- a/drivers/net/tokenring/lanstreamer.c +++ b/drivers/net/tokenring/lanstreamer.c @@ -127,7 +127,6 @@ #include #include -#include #include "lanstreamer.h" diff --git a/drivers/net/tokenring/madgemc.c b/drivers/net/tokenring/madgemc.c index 1cdc034f6aec..28adcdf3b14c 100644 --- a/drivers/net/tokenring/madgemc.c +++ b/drivers/net/tokenring/madgemc.c @@ -28,7 +28,6 @@ static const char version[] = "madgemc.c: v0.91 23/01/2000 by Adam Fritzler\n"; #include #include -#include #include #include diff --git a/drivers/net/tokenring/olympic.c b/drivers/net/tokenring/olympic.c index fd8dce90c957..0e234741cc79 100644 --- a/drivers/net/tokenring/olympic.c +++ b/drivers/net/tokenring/olympic.c @@ -106,7 +106,6 @@ #include #include -#include #include "olympic.h" diff --git a/drivers/net/tokenring/proteon.c b/drivers/net/tokenring/proteon.c index 8d362e64a40e..62d90e40f9ec 100644 --- a/drivers/net/tokenring/proteon.c +++ b/drivers/net/tokenring/proteon.c @@ -31,7 +31,6 @@ static const char version[] = "proteon.c: v1.00 02/01/2003 by Jochen Friedrich\n #include #include -#include #include #include #include diff --git a/drivers/net/tokenring/skisa.c b/drivers/net/tokenring/skisa.c index 46db5c5395b2..ee11e93dc30e 100644 --- a/drivers/net/tokenring/skisa.c +++ b/drivers/net/tokenring/skisa.c @@ -38,7 +38,6 @@ static const char version[] = "skisa.c: v1.03 09/12/2002 by Jochen Friedrich\n"; #include #include -#include #include #include #include diff --git a/drivers/net/tokenring/smctr.c b/drivers/net/tokenring/smctr.c index 029846a98636..cb35fb79e016 100644 --- a/drivers/net/tokenring/smctr.c +++ b/drivers/net/tokenring/smctr.c @@ -49,7 +49,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/tokenring/tms380tr.c b/drivers/net/tokenring/tms380tr.c index 102f896bbc58..be4813e0366c 100644 --- a/drivers/net/tokenring/tms380tr.c +++ b/drivers/net/tokenring/tms380tr.c @@ -98,7 +98,6 @@ static const char version[] = "tms380tr.c: v1.10 30/12/2002 by Christoph Goos, A #include #include -#include #include #include #include diff --git a/drivers/net/tokenring/tmspci.c b/drivers/net/tokenring/tmspci.c index d3e788a9cd1c..fb9918da5792 100644 --- a/drivers/net/tokenring/tmspci.c +++ b/drivers/net/tokenring/tmspci.c @@ -34,7 +34,6 @@ #include #include -#include #include #include diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 74d7f76d14a3..bb8c72c79c6f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -69,7 +69,6 @@ #include #include -#include #include /* Uncomment to enable debugging */ diff --git a/drivers/net/wan/dlci.c b/drivers/net/wan/dlci.c index 48ab38a34c5a..147614ed86aa 100644 --- a/drivers/net/wan/dlci.c +++ b/drivers/net/wan/dlci.c @@ -50,7 +50,6 @@ #include -#include #include #include #include diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index fe8d060d8fff..c676de7de024 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -93,7 +93,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/wan/hd64570.c b/drivers/net/wan/hd64570.c index 33b67d88fceb..cf4903355a34 100644 --- a/drivers/net/wan/hd64570.c +++ b/drivers/net/wan/hd64570.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include "hd64570.h" diff --git a/drivers/net/wan/hd64572.c b/drivers/net/wan/hd64572.c index efc0db101183..e2779faa6c4f 100644 --- a/drivers/net/wan/hd64572.c +++ b/drivers/net/wan/hd64572.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include "hd64572.h" diff --git a/drivers/net/wan/lapbether.c b/drivers/net/wan/lapbether.c index 7beeb9b88a3b..a73b49eb87e3 100644 --- a/drivers/net/wan/lapbether.c +++ b/drivers/net/wan/lapbether.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wan/sdla.c b/drivers/net/wan/sdla.c index c8531612eea9..de3bbf43fc5a 100644 --- a/drivers/net/wan/sdla.c +++ b/drivers/net/wan/sdla.c @@ -54,7 +54,6 @@ #include #include -#include #include #include #include diff --git a/drivers/net/wan/x25_asy.c b/drivers/net/wan/x25_asy.c index e862369b4a6d..d7a65e141d1a 100644 --- a/drivers/net/wan/x25_asy.c +++ b/drivers/net/wan/x25_asy.c @@ -18,7 +18,6 @@ #include -#include #include #include #include diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index ddc061dd150c..520a4b2eb9cc 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/drivers/net/wireless/airo_cs.c b/drivers/net/wireless/airo_cs.c index c983c10e0f6a..630577dd3a7a 100644 --- a/drivers/net/wireless/airo_cs.c +++ b/drivers/net/wireless/airo_cs.c @@ -37,7 +37,6 @@ #include #include -#include #include "airo.h" diff --git a/drivers/net/wireless/atmel.c b/drivers/net/wireless/atmel.c index 3010cee7b95a..6c87a823f5a9 100644 --- a/drivers/net/wireless/atmel.c +++ b/drivers/net/wireless/atmel.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/wireless/atmel_cs.c b/drivers/net/wireless/atmel_cs.c index ec295c4f677d..ded03d226a71 100644 --- a/drivers/net/wireless/atmel_cs.c +++ b/drivers/net/wireless/atmel_cs.c @@ -48,7 +48,6 @@ #include #include -#include #include #include "atmel.h" diff --git a/drivers/net/wireless/prism54/islpci_mgt.c b/drivers/net/wireless/prism54/islpci_mgt.c index 851fa10241e1..c5404cb59e08 100644 --- a/drivers/net/wireless/prism54/islpci_mgt.c +++ b/drivers/net/wireless/prism54/islpci_mgt.c @@ -24,7 +24,6 @@ #include #include -#include #include #include "prismcompat.h" diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 04fec1fa6e0b..86a738bf591c 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -53,7 +53,6 @@ #include #include -#include #include #include diff --git a/drivers/net/wireless/wl3501_cs.c b/drivers/net/wireless/wl3501_cs.c index 98fbf54f6004..00f6e69c1dcd 100644 --- a/drivers/net/wireless/wl3501_cs.c +++ b/drivers/net/wireless/wl3501_cs.c @@ -53,7 +53,6 @@ #include #include -#include #include "wl3501.h" diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index b764ac22d523..44d01afafe9c 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 7ff10c1e8664..9c5aae52a3b6 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -55,7 +55,6 @@ #include #include -#include #include #include diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c index 95930d016235..1f9e9fefb8e7 100644 --- a/drivers/parisc/iosapic.c +++ b/drivers/parisc/iosapic.c @@ -140,7 +140,6 @@ #include #include #include -#include #include /* read/write functions */ #ifdef CONFIG_SUPERIO #include diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c index d5f3d753a108..320e43a52e9d 100644 --- a/drivers/parisc/lba_pci.c +++ b/drivers/parisc/lba_pci.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include /* for register_parisc_driver() stuff */ diff --git a/drivers/pcmcia/cs.c b/drivers/pcmcia/cs.c index d9ea192c4001..673c14ea11e3 100644 --- a/drivers/pcmcia/cs.c +++ b/drivers/pcmcia/cs.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pcmcia/i82092.c b/drivers/pcmcia/i82092.c index 3e447d0387b7..0b66bfc0e148 100644 --- a/drivers/pcmcia/i82092.c +++ b/drivers/pcmcia/i82092.c @@ -17,7 +17,6 @@ #include -#include #include #include "i82092aa.h" diff --git a/drivers/pcmcia/i82365.c b/drivers/pcmcia/i82365.c index 72a033a2acdb..e6f3d17dd2b4 100644 --- a/drivers/pcmcia/i82365.c +++ b/drivers/pcmcia/i82365.c @@ -48,7 +48,6 @@ #include #include #include -#include #include diff --git a/drivers/pcmcia/m32r_cfc.c b/drivers/pcmcia/m32r_cfc.c index 2adb0106a039..a26f38c6402a 100644 --- a/drivers/pcmcia/m32r_cfc.c +++ b/drivers/pcmcia/m32r_cfc.c @@ -24,7 +24,6 @@ #include #include #include -#include #include diff --git a/drivers/pcmcia/m32r_pcc.c b/drivers/pcmcia/m32r_pcc.c index 1511ff71c87b..296514155cd5 100644 --- a/drivers/pcmcia/m32r_pcc.c +++ b/drivers/pcmcia/m32r_pcc.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pcmcia/m8xx_pcmcia.c b/drivers/pcmcia/m8xx_pcmcia.c index 271a590a5f3c..a317defd616d 100644 --- a/drivers/pcmcia/m8xx_pcmcia.c +++ b/drivers/pcmcia/m8xx_pcmcia.c @@ -52,7 +52,6 @@ #include #include -#include #include #include #include diff --git a/drivers/pcmcia/pd6729.c b/drivers/pcmcia/pd6729.c index 96c72e90b79c..0f8b70b27762 100644 --- a/drivers/pcmcia/pd6729.c +++ b/drivers/pcmcia/pd6729.c @@ -19,7 +19,6 @@ #include -#include #include "pd6729.h" #include "i82365.h" diff --git a/drivers/pcmcia/pxa2xx_base.c b/drivers/pcmcia/pxa2xx_base.c index 64d433ec4fc6..85ac0957dcd0 100644 --- a/drivers/pcmcia/pxa2xx_base.c +++ b/drivers/pcmcia/pxa2xx_base.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pcmcia/sa11xx_base.c b/drivers/pcmcia/sa11xx_base.c index 0c62fe31a40e..79ecc23643ec 100644 --- a/drivers/pcmcia/sa11xx_base.c +++ b/drivers/pcmcia/sa11xx_base.c @@ -41,7 +41,6 @@ #include #include -#include #include "soc_common.h" #include "sa11xx_base.h" diff --git a/drivers/pcmcia/soc_common.c b/drivers/pcmcia/soc_common.c index a0a9c2aa8d78..5d22c6acb8e2 100644 --- a/drivers/pcmcia/soc_common.c +++ b/drivers/pcmcia/soc_common.c @@ -45,7 +45,6 @@ #include #include -#include #include "soc_common.h" diff --git a/drivers/pcmcia/socket_sysfs.c b/drivers/pcmcia/socket_sysfs.c index 71aeed93037c..d6881514d38e 100644 --- a/drivers/pcmcia/socket_sysfs.c +++ b/drivers/pcmcia/socket_sysfs.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pcmcia/tcic.c b/drivers/pcmcia/tcic.c index 310160bffe38..cbe15fc37411 100644 --- a/drivers/pcmcia/tcic.c +++ b/drivers/pcmcia/tcic.c @@ -47,7 +47,6 @@ #include #include -#include #include #include "tcic.h" diff --git a/drivers/pcmcia/xxs1500_ss.c b/drivers/pcmcia/xxs1500_ss.c index 379f4218857d..8f6698074f8e 100644 --- a/drivers/pcmcia/xxs1500_ss.c +++ b/drivers/pcmcia/xxs1500_ss.c @@ -21,7 +21,6 @@ #include #include -#include #include #define MEM_MAP_SIZE 0x400000 diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index b859d16cf78c..769d265b221b 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -17,7 +17,6 @@ #include #include -#include #include #include "pnpbios.h" diff --git a/drivers/pnp/pnpbios/core.c b/drivers/pnp/pnpbios/core.c index cfe86853feb2..9d4222648640 100644 --- a/drivers/pnp/pnpbios/core.c +++ b/drivers/pnp/pnpbios/core.c @@ -65,7 +65,6 @@ #include #include -#include #include #include "../base.h" diff --git a/drivers/s390/crypto/ap_bus.c b/drivers/s390/crypto/ap_bus.c index c5c121f14c99..7e9a72eb2fe0 100644 --- a/drivers/s390/crypto/ap_bus.c +++ b/drivers/s390/crypto/ap_bus.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/sbus/char/flash.c b/drivers/sbus/char/flash.c index 826157f38694..327657e2e264 100644 --- a/drivers/sbus/char/flash.c +++ b/drivers/sbus/char/flash.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include diff --git a/drivers/sbus/char/openprom.c b/drivers/sbus/char/openprom.c index 8d6e508222b8..2236aea3ca2f 100644 --- a/drivers/sbus/char/openprom.c +++ b/drivers/sbus/char/openprom.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #ifdef CONFIG_PCI diff --git a/drivers/sbus/char/uctrl.c b/drivers/sbus/char/uctrl.c index 0b31658ccde5..a9e468cc1cac 100644 --- a/drivers/sbus/char/uctrl.c +++ b/drivers/sbus/char/uctrl.c @@ -19,7 +19,6 @@ #include #include -#include #include #include #include diff --git a/drivers/scsi/53c700.c b/drivers/scsi/53c700.c index f672491774eb..a3adfb4357f5 100644 --- a/drivers/scsi/53c700.c +++ b/drivers/scsi/53c700.c @@ -129,7 +129,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/BusLogic.c b/drivers/scsi/BusLogic.c index f66c33b9ab41..d4da3708763b 100644 --- a/drivers/scsi/BusLogic.c +++ b/drivers/scsi/BusLogic.c @@ -47,7 +47,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/advansys.c b/drivers/scsi/advansys.c index bfd618a69499..374c4edf4fcb 100644 --- a/drivers/scsi/advansys.c +++ b/drivers/scsi/advansys.c @@ -41,7 +41,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/aha152x.c b/drivers/scsi/aha152x.c index f17c92cf808b..19a36945e6fd 100644 --- a/drivers/scsi/aha152x.c +++ b/drivers/scsi/aha152x.c @@ -239,7 +239,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/aha1542.c b/drivers/scsi/aha1542.c index ed119cedaae0..ede91f378000 100644 --- a/drivers/scsi/aha1542.c +++ b/drivers/scsi/aha1542.c @@ -42,7 +42,6 @@ #include #include -#include #include #include "scsi.h" diff --git a/drivers/scsi/aha1740.c b/drivers/scsi/aha1740.c index 1c10b796c1a2..a3e6ed353917 100644 --- a/drivers/scsi/aha1740.c +++ b/drivers/scsi/aha1740.c @@ -53,7 +53,6 @@ #include #include -#include #include #include "scsi.h" diff --git a/drivers/scsi/arcmsr/arcmsr_hba.c b/drivers/scsi/arcmsr/arcmsr_hba.c index 2fe9e90e53d9..cbde1dca45ad 100644 --- a/drivers/scsi/arcmsr/arcmsr_hba.c +++ b/drivers/scsi/arcmsr/arcmsr_hba.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/arm/acornscsi.c b/drivers/scsi/arm/acornscsi.c index c454e44cf51c..b330438ac662 100644 --- a/drivers/scsi/arm/acornscsi.c +++ b/drivers/scsi/arm/acornscsi.c @@ -138,7 +138,6 @@ #include #include -#include #include #include "../scsi.h" diff --git a/drivers/scsi/arm/cumana_1.c b/drivers/scsi/arm/cumana_1.c index a3398fe70a9c..c3b99c93637a 100644 --- a/drivers/scsi/arm/cumana_1.c +++ b/drivers/scsi/arm/cumana_1.c @@ -12,7 +12,6 @@ #include #include -#include #include "../scsi.h" #include diff --git a/drivers/scsi/arm/oak.c b/drivers/scsi/arm/oak.c index 849cdf89f7bb..d25f944b59c2 100644 --- a/drivers/scsi/arm/oak.c +++ b/drivers/scsi/arm/oak.c @@ -13,7 +13,6 @@ #include #include -#include #include "../scsi.h" #include diff --git a/drivers/scsi/atp870u.c b/drivers/scsi/atp870u.c index 7e6eca4a125e..f29d5121d5ed 100644 --- a/drivers/scsi/atp870u.c +++ b/drivers/scsi/atp870u.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include diff --git a/drivers/scsi/dtc.c b/drivers/scsi/dtc.c index c2677ba29c74..4b11bb04f5c4 100644 --- a/drivers/scsi/dtc.c +++ b/drivers/scsi/dtc.c @@ -72,7 +72,6 @@ #endif -#include #include #include #include diff --git a/drivers/scsi/fd_mcs.c b/drivers/scsi/fd_mcs.c index a2c6135d337e..53bfcaa86f09 100644 --- a/drivers/scsi/fd_mcs.c +++ b/drivers/scsi/fd_mcs.c @@ -93,7 +93,6 @@ #include #include -#include #include "scsi.h" #include diff --git a/drivers/scsi/fdomain.c b/drivers/scsi/fdomain.c index 643f6d500fe7..1a2a1e5824e3 100644 --- a/drivers/scsi/fdomain.c +++ b/drivers/scsi/fdomain.c @@ -282,7 +282,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/g_NCR5380.c b/drivers/scsi/g_NCR5380.c index 81182badfeb1..1a5954f0915a 100644 --- a/drivers/scsi/g_NCR5380.c +++ b/drivers/scsi/g_NCR5380.c @@ -100,7 +100,6 @@ #undef NCR5380_STAT_LIMIT #endif -#include #include #include #include diff --git a/drivers/scsi/gdth.c b/drivers/scsi/gdth.c index d42ec921de46..5d72274c507f 100644 --- a/drivers/scsi/gdth.c +++ b/drivers/scsi/gdth.c @@ -129,7 +129,6 @@ #include #include -#include #include #include #include diff --git a/drivers/scsi/ibmmca.c b/drivers/scsi/ibmmca.c index 67fc8ffd52e6..cd09132d5d7d 100644 --- a/drivers/scsi/ibmmca.c +++ b/drivers/scsi/ibmmca.c @@ -32,7 +32,6 @@ #include #include -#include #include #include "scsi.h" diff --git a/drivers/scsi/in2000.c b/drivers/scsi/in2000.c index 112f1bec7756..deb5b6d8398e 100644 --- a/drivers/scsi/in2000.c +++ b/drivers/scsi/in2000.c @@ -123,7 +123,6 @@ #include #include -#include #include "scsi.h" #include diff --git a/drivers/scsi/mac53c94.c b/drivers/scsi/mac53c94.c index e6173376605d..e5cd8d8d4ce7 100644 --- a/drivers/scsi/mac53c94.c +++ b/drivers/scsi/mac53c94.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/drivers/scsi/mac_scsi.c b/drivers/scsi/mac_scsi.c index 2bccfbe5661e..24828b54773a 100644 --- a/drivers/scsi/mac_scsi.c +++ b/drivers/scsi/mac_scsi.c @@ -43,7 +43,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c index 494474779532..e8a04ae3276a 100644 --- a/drivers/scsi/mesh.c +++ b/drivers/scsi/mesh.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/ncr53c8xx.c b/drivers/scsi/ncr53c8xx.c index 4b3b4755945c..5982a587babc 100644 --- a/drivers/scsi/ncr53c8xx.c +++ b/drivers/scsi/ncr53c8xx.c @@ -115,7 +115,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/nsp32.c b/drivers/scsi/nsp32.c index 002924963cd8..62b616891a33 100644 --- a/drivers/scsi/nsp32.c +++ b/drivers/scsi/nsp32.c @@ -38,7 +38,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index de0b1a704fb5..21883a2d6324 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -54,7 +54,6 @@ static const char * osst_version = "0.99.4"; #include #include #include -#include /* The driver prints some debugging information on the console if DEBUG is defined and non-zero. */ diff --git a/drivers/scsi/pas16.c b/drivers/scsi/pas16.c index f2018b46f494..2f72c9807b12 100644 --- a/drivers/scsi/pas16.c +++ b/drivers/scsi/pas16.c @@ -113,7 +113,6 @@ #include -#include #include #include #include diff --git a/drivers/scsi/qla1280.c b/drivers/scsi/qla1280.c index d838205ab169..6c6486f626ee 100644 --- a/drivers/scsi/qla1280.c +++ b/drivers/scsi/qla1280.c @@ -359,7 +359,6 @@ #include #include #include -#include #include #include diff --git a/drivers/scsi/qlogicpti.c b/drivers/scsi/qlogicpti.c index e40dc1cb09a0..b191dd549207 100644 --- a/drivers/scsi/qlogicpti.c +++ b/drivers/scsi/qlogicpti.c @@ -35,7 +35,6 @@ #include "qlogicpti.h" #include -#include #include #include #include diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 9262cdfa4b23..a15f691f9d34 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -42,7 +42,6 @@ static const char *verstr = "20101219"; #include #include -#include #include #include diff --git a/drivers/scsi/sun3_scsi.c b/drivers/scsi/sun3_scsi.c index baf7328de956..6e25889db9d4 100644 --- a/drivers/scsi/sun3_scsi.c +++ b/drivers/scsi/sun3_scsi.c @@ -63,7 +63,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/sun3_scsi_vme.c b/drivers/scsi/sun3_scsi_vme.c index fbba78e5722e..a3dd55d1d2fd 100644 --- a/drivers/scsi/sun3_scsi_vme.c +++ b/drivers/scsi/sun3_scsi_vme.c @@ -25,7 +25,6 @@ #include #include -#include #include #include diff --git a/drivers/scsi/sym53c416.c b/drivers/scsi/sym53c416.c index 012c86edd59f..ac4eca6a5328 100644 --- a/drivers/scsi/sym53c416.c +++ b/drivers/scsi/sym53c416.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/t128.c b/drivers/scsi/t128.c index 041eaaace2c3..d672d97fb84a 100644 --- a/drivers/scsi/t128.c +++ b/drivers/scsi/t128.c @@ -106,7 +106,6 @@ * $Log: t128.c,v $ */ -#include #include #include #include diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c index 90e104d6b558..9c216e563568 100644 --- a/drivers/scsi/u14-34f.c +++ b/drivers/scsi/u14-34f.c @@ -410,7 +410,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/ultrastor.c b/drivers/scsi/ultrastor.c index 7e22b737dfd8..14e0c40a68c9 100644 --- a/drivers/scsi/ultrastor.c +++ b/drivers/scsi/ultrastor.c @@ -141,7 +141,6 @@ #include #include -#include #include #define ULTRASTOR_PRIVATE /* Get the private stuff from ultrastor.h */ diff --git a/drivers/scsi/wd7000.c b/drivers/scsi/wd7000.c index 9ee0afef2d16..d89a5dfd3ade 100644 --- a/drivers/scsi/wd7000.c +++ b/drivers/scsi/wd7000.c @@ -179,7 +179,6 @@ #include #include -#include #include #include diff --git a/drivers/spi/spi-omap-uwire.c b/drivers/spi/spi-omap-uwire.c index 610f7391456e..9b0d71696039 100644 --- a/drivers/spi/spi-omap-uwire.c +++ b/drivers/spi/spi-omap-uwire.c @@ -47,7 +47,6 @@ #include #include -#include #include #include #include diff --git a/drivers/staging/comedi/drivers.c b/drivers/staging/comedi/drivers.c index db1fd63aaab3..bf185e2807d1 100644 --- a/drivers/staging/comedi/drivers.c +++ b/drivers/staging/comedi/drivers.c @@ -42,7 +42,6 @@ #include #include #include -#include #include "comedidev.h" #include "internal.h" diff --git a/drivers/staging/comedi/drivers/cb_pcidas64.c b/drivers/staging/comedi/drivers/cb_pcidas64.c index c9e8c4785768..915157d47805 100644 --- a/drivers/staging/comedi/drivers/cb_pcidas64.c +++ b/drivers/staging/comedi/drivers/cb_pcidas64.c @@ -86,7 +86,6 @@ TODO: #include "../comedidev.h" #include #include -#include #include "comedi_pci.h" #include "8253.h" diff --git a/drivers/staging/comedi/drivers/mite.c b/drivers/staging/comedi/drivers/mite.c index fd274e9c7b78..13e9c8071696 100644 --- a/drivers/staging/comedi/drivers/mite.c +++ b/drivers/staging/comedi/drivers/mite.c @@ -55,7 +55,6 @@ #include "comedi_pci.h" #include "../comedidev.h" -#include #define PCI_MITE_SIZE 4096 #define PCI_DAQ_SIZE 4096 diff --git a/drivers/staging/crystalhd/crystalhd.h b/drivers/staging/crystalhd/crystalhd.h index 3f4d79515026..b3a550bd5b06 100644 --- a/drivers/staging/crystalhd/crystalhd.h +++ b/drivers/staging/crystalhd/crystalhd.h @@ -1,7 +1,6 @@ #ifndef _CRYSTALHD_H_ #define _CRYSTALHD_H_ -#include #include "bc_dts_defs.h" #include "crystalhd_misc.h" #include "bc_dts_glob_lnx.h" diff --git a/drivers/staging/crystalhd/crystalhd_lnx.h b/drivers/staging/crystalhd/crystalhd_lnx.h index a81f9298b0a1..a9e36336d097 100644 --- a/drivers/staging/crystalhd/crystalhd_lnx.h +++ b/drivers/staging/crystalhd/crystalhd_lnx.h @@ -45,7 +45,6 @@ #include #include #include -#include #include #include "crystalhd.h" diff --git a/drivers/staging/crystalhd/crystalhd_misc.h b/drivers/staging/crystalhd/crystalhd_misc.h index 84c87938a831..8cdaa7a34814 100644 --- a/drivers/staging/crystalhd/crystalhd_misc.h +++ b/drivers/staging/crystalhd/crystalhd_misc.h @@ -37,6 +37,7 @@ #include #include #include +#include "bc_dts_glob_lnx.h" /* Global log level variable defined in crystal_misc.c file */ extern uint32_t g_linklog_level; diff --git a/drivers/staging/et131x/et131x.c b/drivers/staging/et131x/et131x.c index 3f919babe79b..886f5650444e 100644 --- a/drivers/staging/et131x/et131x.c +++ b/drivers/staging/et131x/et131x.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include diff --git a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c index 7569aa0f24d1..c4a8a0a26eb5 100644 --- a/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c +++ b/drivers/staging/ft1000/ft1000-pcmcia/ft1000_hw.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/drivers/staging/media/go7007/go7007-driver.c b/drivers/staging/media/go7007/go7007-driver.c index 6c9279a6d606..ece2dd146487 100644 --- a/drivers/staging/media/go7007/go7007-driver.c +++ b/drivers/staging/media/go7007/go7007-driver.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/staging/media/go7007/go7007-i2c.c b/drivers/staging/media/go7007/go7007-i2c.c index b8cfa1a6eaeb..6bc82aaeef11 100644 --- a/drivers/staging/media/go7007/go7007-i2c.c +++ b/drivers/staging/media/go7007/go7007-i2c.c @@ -26,7 +26,6 @@ #include #include #include -#include #include "go7007-priv.h" #include "wis-i2c.h" diff --git a/drivers/staging/media/go7007/go7007-v4l2.c b/drivers/staging/media/go7007/go7007-v4l2.c index 2b27d8da70a2..c8c96e26bc72 100644 --- a/drivers/staging/media/go7007/go7007-v4l2.c +++ b/drivers/staging/media/go7007/go7007-v4l2.c @@ -34,7 +34,6 @@ #include #include #include -#include #include "go7007.h" #include "go7007-priv.h" diff --git a/drivers/staging/media/go7007/snd-go7007.c b/drivers/staging/media/go7007/snd-go7007.c index d071c838ac2a..5af29ff68bfd 100644 --- a/drivers/staging/media/go7007/snd-go7007.c +++ b/drivers/staging/media/go7007/snd-go7007.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/staging/media/lirc/lirc_serial.c b/drivers/staging/media/lirc/lirc_serial.c index 8dd8897ad860..18a798c27a7d 100644 --- a/drivers/staging/media/lirc/lirc_serial.c +++ b/drivers/staging/media/lirc/lirc_serial.c @@ -66,7 +66,6 @@ #include #include -#include #include #include #include diff --git a/drivers/staging/media/lirc/lirc_sir.c b/drivers/staging/media/lirc/lirc_sir.c index c94382b917ac..945d9623550b 100644 --- a/drivers/staging/media/lirc/lirc_sir.c +++ b/drivers/staging/media/lirc/lirc_sir.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/staging/panel/panel.c b/drivers/staging/panel/panel.c index 4683d5f355c0..6183573f112f 100644 --- a/drivers/staging/panel/panel.c +++ b/drivers/staging/panel/panel.c @@ -58,7 +58,6 @@ #include #include -#include #define LCD_MINOR 156 #define KEYPAD_MINOR 185 diff --git a/drivers/staging/sbe-2t3e3/io.c b/drivers/staging/sbe-2t3e3/io.c index b458ff034067..9a50bcc59594 100644 --- a/drivers/staging/sbe-2t3e3/io.c +++ b/drivers/staging/sbe-2t3e3/io.c @@ -11,7 +11,6 @@ */ #include -#include #include "2t3e3.h" #include "ctrl.h" diff --git a/drivers/staging/telephony/phonedev.c b/drivers/staging/telephony/phonedev.c index 1915af201175..1dd0b6717ccc 100644 --- a/drivers/staging/telephony/phonedev.c +++ b/drivers/staging/telephony/phonedev.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include diff --git a/drivers/staging/tidspbridge/include/dspbridge/host_os.h b/drivers/staging/tidspbridge/include/dspbridge/host_os.h index a2f31c69d12e..ed00d3da3205 100644 --- a/drivers/staging/tidspbridge/include/dspbridge/host_os.h +++ b/drivers/staging/tidspbridge/include/dspbridge/host_os.h @@ -17,7 +17,6 @@ #ifndef _HOST_OS_H_ #define _HOST_OS_H_ -#include #include #include #include diff --git a/drivers/staging/wlags49_h2/wl_cs.c b/drivers/staging/wlags49_h2/wl_cs.c index a2cbb29c3f59..7084f414846e 100644 --- a/drivers/staging/wlags49_h2/wl_cs.c +++ b/drivers/staging/wlags49_h2/wl_cs.c @@ -74,7 +74,6 @@ #include #include #include -#include #include #include diff --git a/drivers/staging/wlags49_h2/wl_main.c b/drivers/staging/wlags49_h2/wl_main.c index dab603e0f452..d5bf0a7012f2 100644 --- a/drivers/staging/wlags49_h2/wl_main.c +++ b/drivers/staging/wlags49_h2/wl_main.c @@ -86,8 +86,7 @@ // #include // #include // #include -// #include -// #include +// // #include #include #include diff --git a/drivers/staging/wlags49_h2/wl_netdev.c b/drivers/staging/wlags49_h2/wl_netdev.c index 9c16f5478a75..90820ff1aced 100644 --- a/drivers/staging/wlags49_h2/wl_netdev.c +++ b/drivers/staging/wlags49_h2/wl_netdev.c @@ -79,8 +79,7 @@ // #include // #include // #include -// #include -// #include +// // #include #include #include diff --git a/drivers/staging/wlags49_h2/wl_pci.c b/drivers/staging/wlags49_h2/wl_pci.c index 2bd9b84ace8e..3df990c7306a 100644 --- a/drivers/staging/wlags49_h2/wl_pci.c +++ b/drivers/staging/wlags49_h2/wl_pci.c @@ -77,7 +77,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/staging/wlags49_h2/wl_util.c b/drivers/staging/wlags49_h2/wl_util.c index b748a3ff7954..f104e6f1e980 100644 --- a/drivers/staging/wlags49_h2/wl_util.c +++ b/drivers/staging/wlags49_h2/wl_util.c @@ -73,8 +73,7 @@ // #include // #include // #include -// #include -// #include +// // #include #include #include diff --git a/drivers/tty/amiserial.c b/drivers/tty/amiserial.c index afadcd43d14e..24145c30c9b0 100644 --- a/drivers/tty/amiserial.c +++ b/drivers/tty/amiserial.c @@ -85,7 +85,6 @@ static char *serial_version = "4.30"; #include -#include #include diff --git a/drivers/tty/isicom.c b/drivers/tty/isicom.c index 03c14979accf..794ecb40017c 100644 --- a/drivers/tty/isicom.c +++ b/drivers/tty/isicom.c @@ -133,7 +133,6 @@ #include #include -#include #include diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c index 8a8d0440bab0..324467d28a54 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -46,7 +46,6 @@ #include #include -#include #include #include diff --git a/drivers/tty/mxser.c b/drivers/tty/mxser.c index 17ff377e4129..c6f372dd5623 100644 --- a/drivers/tty/mxser.c +++ b/drivers/tty/mxser.c @@ -41,7 +41,6 @@ #include #include -#include #include #include #include diff --git a/drivers/tty/n_hdlc.c b/drivers/tty/n_hdlc.c index a09ce3ef5d74..1b2db9a3038c 100644 --- a/drivers/tty/n_hdlc.c +++ b/drivers/tty/n_hdlc.c @@ -102,7 +102,6 @@ #include #include -#include #include #include diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index d2256d08ee7e..94b6eda87afd 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -50,7 +50,6 @@ #include #include -#include /* number of characters left in xmit buffer before select has we have room */ #define WAKEUP_CHARS 256 diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c index f96ecaec24f8..eeae7fafe9a7 100644 --- a/drivers/tty/pty.c +++ b/drivers/tty/pty.c @@ -27,7 +27,6 @@ #include #include -#include #ifdef CONFIG_UNIX98_PTYS static struct tty_driver *ptm_driver; diff --git a/drivers/tty/serial/68328serial.c b/drivers/tty/serial/68328serial.c index 7398390e7e65..5ce782529d65 100644 --- a/drivers/tty/serial/68328serial.c +++ b/drivers/tty/serial/68328serial.c @@ -39,7 +39,6 @@ #include #include -#include #include #include diff --git a/drivers/tty/serial/8250/serial_cs.c b/drivers/tty/serial/8250/serial_cs.c index 86090605a84e..29b695d041ec 100644 --- a/drivers/tty/serial/8250/serial_cs.c +++ b/drivers/tty/serial/8250/serial_cs.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include diff --git a/drivers/tty/serial/crisv10.c b/drivers/tty/serial/crisv10.c index 90280ae18238..5b07c0c3a10c 100644 --- a/drivers/tty/serial/crisv10.c +++ b/drivers/tty/serial/crisv10.c @@ -34,7 +34,6 @@ static char *serial_version = "$Revision: 1.25 $"; #include #include -#include #include #include diff --git a/drivers/tty/serial/dz.c b/drivers/tty/serial/dz.c index e3699a84049f..6491b8644a7f 100644 --- a/drivers/tty/serial/dz.c +++ b/drivers/tty/serial/dz.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include diff --git a/drivers/tty/serial/icom.c b/drivers/tty/serial/icom.c index d55709a7a75a..defc4e3393a3 100644 --- a/drivers/tty/serial/icom.c +++ b/drivers/tty/serial/icom.c @@ -52,7 +52,6 @@ #include #include -#include #include #include #include diff --git a/drivers/tty/serial/msm_serial_hs.c b/drivers/tty/serial/msm_serial_hs.c index 5e85e1e14c44..fca13dc73e23 100644 --- a/drivers/tty/serial/msm_serial_hs.c +++ b/drivers/tty/serial/msm_serial_hs.c @@ -50,7 +50,6 @@ #include #include -#include #include #include diff --git a/drivers/tty/serial/zs.c b/drivers/tty/serial/zs.c index b7455b526080..4001eee6c08d 100644 --- a/drivers/tty/serial/zs.c +++ b/drivers/tty/serial/zs.c @@ -67,7 +67,6 @@ #include #include -#include #include #include diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 8e518da85fd5..593d40ad0a6b 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -86,7 +86,6 @@ #include #include -#include #include #include #include diff --git a/drivers/tty/synclink_gt.c b/drivers/tty/synclink_gt.c index 34b1a3c43066..aa1debf97cc7 100644 --- a/drivers/tty/synclink_gt.c +++ b/drivers/tty/synclink_gt.c @@ -73,7 +73,6 @@ #include #include -#include #include #include #include diff --git a/drivers/tty/synclinkmp.c b/drivers/tty/synclinkmp.c index 4fb6c4b31b79..a3dddc12d2fe 100644 --- a/drivers/tty/synclinkmp.c +++ b/drivers/tty/synclinkmp.c @@ -58,7 +58,6 @@ #include #include -#include #include #include #include diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index dd8a938510ca..d939bd705c71 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -97,7 +97,6 @@ #include #include -#include #include #include diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index 9314d93c1a20..a1b9a2f68567 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -23,7 +23,6 @@ #include #include -#include #undef TTY_DEBUG_WAIT_UNTIL_SENT diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 84c4a7d5603e..3bdd4b19dd06 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -99,7 +99,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index 2204a4c68d85..77779271f487 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -54,7 +54,6 @@ #include #include -#include #include /* gadget stack */ diff --git a/drivers/usb/gadget/at91_udc.c b/drivers/usb/gadget/at91_udc.c index 15a8cdb2ded5..d03ab95fcc8d 100644 --- a/drivers/usb/gadget/at91_udc.c +++ b/drivers/usb/gadget/at91_udc.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c index e1cd56c5e2a8..a6dfd2164166 100644 --- a/drivers/usb/gadget/dummy_hcd.c +++ b/drivers/usb/gadget/dummy_hcd.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #define DRIVER_DESC "USB Host+Gadget Emulator" diff --git a/drivers/usb/gadget/fsl_udc_core.c b/drivers/usb/gadget/fsl_udc_core.c index b30e21fdbb1b..5f94e79cd6b9 100644 --- a/drivers/usb/gadget/fsl_udc_core.c +++ b/drivers/usb/gadget/fsl_udc_core.c @@ -43,7 +43,6 @@ #include #include -#include #include #include diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c index e1dfd32dc805..e151d6b87dee 100644 --- a/drivers/usb/gadget/goku_udc.c +++ b/drivers/usb/gadget/goku_udc.c @@ -43,7 +43,6 @@ #include #include #include -#include #include diff --git a/drivers/usb/gadget/langwell_udc.c b/drivers/usb/gadget/langwell_udc.c index edd52d963f14..f9cedd52cf20 100644 --- a/drivers/usb/gadget/langwell_udc.c +++ b/drivers/usb/gadget/langwell_udc.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include "langwell_udc.h" diff --git a/drivers/usb/gadget/mv_udc_core.c b/drivers/usb/gadget/mv_udc_core.c index 19bbe80c2f8c..a73cf406e2a4 100644 --- a/drivers/usb/gadget/mv_udc_core.c +++ b/drivers/usb/gadget/mv_udc_core.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include "mv_udc.h" diff --git a/drivers/usb/gadget/net2272.c b/drivers/usb/gadget/net2272.c index 01ae56f47174..43ac7482fa91 100644 --- a/drivers/usb/gadget/net2272.c +++ b/drivers/usb/gadget/net2272.c @@ -42,7 +42,6 @@ #include #include -#include #include #include "net2272.h" diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c index a5ccabc37f30..ac335af154ba 100644 --- a/drivers/usb/gadget/net2280.c +++ b/drivers/usb/gadget/net2280.c @@ -59,7 +59,6 @@ #include #include #include -#include #include diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index b44830df593e..3b4b6dd0f95a 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/gadget/printer.c b/drivers/usb/gadget/printer.c index d83134b0f78a..4e4dc1f5f388 100644 --- a/drivers/usb/gadget/printer.c +++ b/drivers/usb/gadget/printer.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/gadget/pxa25x_udc.c b/drivers/usb/gadget/pxa25x_udc.c index 1b33634f2736..41ed69c96d8c 100644 --- a/drivers/usb/gadget/pxa25x_udc.c +++ b/drivers/usb/gadget/pxa25x_udc.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/gadget/rndis.c b/drivers/usb/gadget/rndis.c index d3cdffea9c8a..73a934a170d1 100644 --- a/drivers/usb/gadget/rndis.c +++ b/drivers/usb/gadget/rndis.c @@ -34,7 +34,6 @@ #include #include -#include #include diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c index ab9c65e2c1d5..195524cde6c3 100644 --- a/drivers/usb/gadget/s3c2410_udc.c +++ b/drivers/usb/gadget/s3c2410_udc.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index aede6374e4b6..057cdda7a489 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #if defined(CONFIG_PPC_PS3) diff --git a/drivers/usb/host/isp116x-hcd.c b/drivers/usb/host/isp116x-hcd.c index 924880087a74..9e65e3091c8a 100644 --- a/drivers/usb/host/isp116x-hcd.c +++ b/drivers/usb/host/isp116x-hcd.c @@ -70,7 +70,6 @@ #include #include -#include #include #include "isp116x.h" diff --git a/drivers/usb/host/isp1362-hcd.c b/drivers/usb/host/isp1362-hcd.c index 9e63cdf1ab75..2ed112d3e159 100644 --- a/drivers/usb/host/isp1362-hcd.c +++ b/drivers/usb/host/isp1362-hcd.c @@ -84,7 +84,6 @@ #include #include -#include #include #include diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index cd5e382db89c..788ff07aa7c1 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -42,7 +42,6 @@ #include #include -#include #include #include diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c index 015c7c62ed49..3b38030b02a8 100644 --- a/drivers/usb/host/oxu210hp-hcd.c +++ b/drivers/usb/host/oxu210hp-hcd.c @@ -40,7 +40,6 @@ #include #include -#include #include #include diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index 2a2cce2d2fa7..91ce1c02e617 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -51,7 +51,6 @@ #include #include -#include #include #include diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index 16dd6a6abf00..dbbd1ba25224 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -55,7 +55,6 @@ #include #include #include -#include #include /* FIXME ohci.h is ONLY for internal use by the OHCI driver. diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index e37dea87bb56..e4db350602b8 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -45,7 +45,6 @@ #include #include #include -#include #include "uhci-hcd.h" diff --git a/drivers/video/amifb.c b/drivers/video/amifb.c index f23cae094f1b..887df9d81422 100644 --- a/drivers/video/amifb.c +++ b/drivers/video/amifb.c @@ -53,7 +53,6 @@ #include #include -#include #include #include #include diff --git a/drivers/video/bt431.h b/drivers/video/bt431.h index c826f2787bad..04e0cfbba538 100644 --- a/drivers/video/bt431.h +++ b/drivers/video/bt431.h @@ -8,7 +8,6 @@ * archive for more details. */ #include -#include /* * Bt431 cursor generator registers, 32-bit aligned. diff --git a/drivers/video/bt455.h b/drivers/video/bt455.h index b7591fea7add..80f61b03e9ae 100644 --- a/drivers/video/bt455.h +++ b/drivers/video/bt455.h @@ -8,7 +8,6 @@ * archive for more details. */ #include -#include /* * Bt455 byte-wide registers, 32-bit aligned. diff --git a/drivers/video/console/fbcon.c b/drivers/video/console/fbcon.c index 8745637e4b7e..2e471c22abf5 100644 --- a/drivers/video/console/fbcon.c +++ b/drivers/video/console/fbcon.c @@ -77,7 +77,6 @@ #include /* For counting font checksums */ #include #include -#include #include "fbcon.h" diff --git a/drivers/video/console/newport_con.c b/drivers/video/console/newport_con.c index a122d9287d16..6d1596629040 100644 --- a/drivers/video/console/newport_con.c +++ b/drivers/video/console/newport_con.c @@ -22,7 +22,6 @@ #include #include -#include #include #include #include diff --git a/drivers/video/cyber2000fb.c b/drivers/video/cyber2000fb.c index 850380795b05..c1527f5b47ee 100644 --- a/drivers/video/cyber2000fb.c +++ b/drivers/video/cyber2000fb.c @@ -51,7 +51,6 @@ #include #include -#include #ifdef __arm__ #include diff --git a/drivers/video/dnfb.c b/drivers/video/dnfb.c index ec56d2544c73..49e3dda1a361 100644 --- a/drivers/video/dnfb.c +++ b/drivers/video/dnfb.c @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/drivers/video/neofb.c b/drivers/video/neofb.c index fb3f67391105..afc9521173ef 100644 --- a/drivers/video/neofb.c +++ b/drivers/video/neofb.c @@ -71,7 +71,6 @@ #include #include #include -#include #ifdef CONFIG_MTRR #include diff --git a/drivers/video/pmag-ba-fb.c b/drivers/video/pmag-ba-fb.c index 0c69fa20251b..9b4a60b52a4c 100644 --- a/drivers/video/pmag-ba-fb.c +++ b/drivers/video/pmag-ba-fb.c @@ -33,7 +33,6 @@ #include #include -#include #include