Diffstat (limited to 'kernel')
47 files changed, 1428 insertions, 496 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index bab1dffe37e9..42423665660a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o +obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/exit.c b/kernel/exit.c index 6686ed1e4aa3..abf9cf3b95c6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -837,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) */ if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) && (tsk->parent_exec_id != tsk->real_parent->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) && - !capable(CAP_KILL)) + tsk->self_exec_id != tsk->parent_exec_id)) tsk->exit_signal = SIGCHLD; signal = tracehook_notify_death(tsk, &cookie, group_dead); @@ -924,6 +923,8 @@ NORET_TYPE void do_exit(long code) schedule(); } + exit_irq_thread(); + exit_signals(tsk); /* sets PF_EXITING */ /* * tsk->flags are checked in the futex code to protect against diff --git a/kernel/fork.c b/kernel/fork.c index 660c2b8765bc..b9e2edd00726 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -645,6 +645,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; +#ifdef CONFIG_DETECT_HUNG_TASK + tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; +#endif tsk->mm = NULL; tsk->active_mm = NULL; @@ -797,6 +800,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) sig->cputime_expires.virt_exp = cputime_zero; sig->cputime_expires.sched_exp = 0; + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + sig->cputime_expires.prof_exp = + secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + sig->cputimer.running = 1; + } + /* The timer lists. */ INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); @@ -812,11 +821,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_inc(¤t->signal->live); return 0; } - sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); - - if (sig) - posix_cpu_timers_init_group(sig); + sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; @@ -856,6 +862,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); + posix_cpu_timers_init_group(sig); + acct_init_pacct(&sig->pacct); tty_audit_fork(sig); @@ -1032,11 +1040,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->default_timer_slack_ns = current->timer_slack_ns; -#ifdef CONFIG_DETECT_SOFTLOCKUP - p->last_switch_count = 0; - p->last_switch_timestamp = 0; -#endif - task_io_accounting_init(&p->ioac); acct_clear_integrals(p); diff --git a/kernel/futex.c b/kernel/futex.c index 6b50a024bca2..eef8cd26b5e5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -883,7 +883,12 @@ retry_private: out_unlock: double_unlock_hb(hb1, hb2); - /* drop_futex_key_refs() must be called outside the spinlocks. */ + /* + * drop_futex_key_refs() must be called outside the spinlocks. During + * the requeue we moved futex_q's from the hash bucket at key1 to the + * one at key2 and updated their key pointer. We no longer need to + * hold the references to key1. 
+ */ while (--drop_count >= 0) drop_futex_key_refs(&key1); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index f394d2a42ca3..cb8a15c19583 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) * and expiry check is done in the hrtimer_interrupt or in the softirq. */ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { - spin_unlock(&base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - spin_lock(&base->cpu_base->lock); + if (wakeup) { + spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + spin_lock(&base->cpu_base->lock); + } else + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + return 1; } + return 0; } @@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + int wakeup) { return 0; } @@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) return 0; } -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, - const enum hrtimer_mode mode) +int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode, + int wakeup) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n * XXX send_remote_softirq() ? 
*/ if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) - hrtimer_enqueue_reprogram(timer, new_base); + hrtimer_enqueue_reprogram(timer, new_base, wakeup); unlock_hrtimer_base(timer, &flags); return ret; } + +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); +} EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** @@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); int hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { - return hrtimer_start_range_ns(timer, tim, 0, mode); + return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); } EXPORT_SYMBOL_GPL(hrtimer_start); diff --git a/kernel/hung_task.c b/kernel/hung_task.c new file mode 100644 index 000000000000..022a4927b785 --- /dev/null +++ b/kernel/hung_task.c @@ -0,0 +1,217 @@ +/* + * Detect Hung Task + * + * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state + * + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/nmi.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/lockdep.h> +#include <linux/module.h> +#include <linux/sysctl.h> + +/* + * The number of tasks checked: + */ +unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; + +/* + * Limit number of tasks checked in a batch. + * + * This value controls the preemptibility of khungtaskd since preemption + * is disabled during the critical section. It also controls the size of + * the RCU grace period. So it needs to be upper-bound. + */ +#define HUNG_TASK_BATCHING 1024 + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; + +unsigned long __read_mostly sysctl_hung_task_warnings = 10; + +static int __read_mostly did_panic; + +static struct task_struct *watchdog_task; + +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * hung task is detected: + */ +unsigned int __read_mostly sysctl_hung_task_panic = + CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + +static int __init hung_task_panic_setup(char *str) +{ + sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("hung_task_panic=", hung_task_panic_setup); + +static int +hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = hung_task_panic, +}; + +static void check_hung_task(struct task_struct *t, unsigned long timeout) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + /* + * Ensure the task is not frozen. + * Also, when a freshly created task is scheduled once, changes + * its state to TASK_UNINTERRUPTIBLE without having ever been + * switched out once, it musn't be checked. 
+ */ + if (unlikely(t->flags & PF_FROZEN || !switch_count)) + return; + + if (switch_count != t->last_switch_count) { + t->last_switch_count = switch_count; + return; + } + if (!sysctl_hung_task_warnings) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than 2 minutes, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%ld seconds.\n", t->comm, t->pid, timeout); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + __debug_show_held_locks(t); + + touch_nmi_watchdog(); + + if (sysctl_hung_task_panic) + panic("hung_task: blocked tasks"); +} + +/* + * To avoid extending the RCU grace period for an unbounded amount of time, + * periodically exit the critical section and enter a new one. + * + * For preemptible RCU it is sufficient to call rcu_read_unlock in order + * exit the grace period. For classic RCU, a reschedule is required. + */ +static void rcu_lock_break(struct task_struct *g, struct task_struct *t) +{ + get_task_struct(g); + get_task_struct(t); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + put_task_struct(t); + put_task_struct(g); +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (120 seconds). If that happens, print out + * a warning. + */ +static void check_hung_uninterruptible_tasks(unsigned long timeout) +{ + int max_count = sysctl_hung_task_check_count; + int batch_count = HUNG_TASK_BATCHING; + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if (test_taint(TAINT_DIE) || did_panic) + return; + + rcu_read_lock(); + do_each_thread(g, t) { + if (!--max_count) + goto unlock; + if (!--batch_count) { + batch_count = HUNG_TASK_BATCHING; + rcu_lock_break(g, t); + /* Exit if t or g was unhashed during refresh. */ + if (t->state == TASK_DEAD || g->state == TASK_DEAD) + goto unlock; + } + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) + check_hung_task(t, timeout); + } while_each_thread(g, t); + unlock: + rcu_read_unlock(); +} + +static unsigned long timeout_jiffies(unsigned long timeout) +{ + /* timeout of 0 will disable the watchdog */ + return timeout ? 
timeout * HZ : MAX_SCHEDULE_TIMEOUT; +} + +/* + * Process updating of timeout sysctl + */ +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + if (ret || !write) + goto out; + + wake_up_process(watchdog_task); + + out: + return ret; +} + +/* + * kthread which checks for tasks stuck in D state + */ +static int watchdog(void *dummy) +{ + set_user_nice(current, 0); + + for ( ; ; ) { + unsigned long timeout = sysctl_hung_task_timeout_secs; + + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; + + check_hung_uninterruptible_tasks(timeout); + } + + return 0; +} + +static int __init hung_task_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); + + return 0; +} + +module_init(hung_task_init); diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 38a25b8d8bff..d06df9c41cba 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data) } /** - * devm_request_irq - allocate an interrupt line for a managed device + * devm_request_threaded_irq - allocate an interrupt line for a managed device * @dev: device to request interrupt for * @irq: Interrupt line to allocate * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * If an IRQ allocated with this function needs to be freed * separately, dev_free_irq() must be used. 
*/ -int devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) { struct irq_devres *dr; int rc; @@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq, if (!dr) return -ENOMEM; - rc = request_irq(irq, handler, irqflags, devname, dev_id); + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, + dev_id); if (rc) { devres_free(dr); return rc; @@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq, return 0; } -EXPORT_SYMBOL(devm_request_irq); +EXPORT_SYMBOL(devm_request_threaded_irq); /** * devm_free_irq - free an interrupt diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 343acecae629..d82142be8dd2 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -339,6 +339,15 @@ irqreturn_t no_action(int cpl, void *dev_id) return IRQ_NONE; } +static void warn_no_thread(unsigned int irq, struct irqaction *action) +{ + if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) + return; + + printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " + "but no thread function available.", irq, action->name); +} + DEFINE_TRACE(irq_handler_entry); DEFINE_TRACE(irq_handler_exit); @@ -363,8 +372,47 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action) trace_irq_handler_entry(irq, action); ret = action->handler(irq, action->dev_id); trace_irq_handler_exit(irq, action, ret); - if (ret == IRQ_HANDLED) + + switch (ret) { + case IRQ_WAKE_THREAD: + /* + * Set result to handled so the spurious check + * does not trigger. + */ + ret = IRQ_HANDLED; + + /* + * Catch drivers which return WAKE_THREAD but + * did not set up a thread function + */ + if (unlikely(!action->thread_fn)) { + warn_no_thread(irq, action); + break; + } + + /* + * Wake up the handler thread for this + * action. In case the thread crashed and was + * killed we just pretend that we handled the + * interrupt. The hardirq handler above has + * disabled the device interrupt, so no irq + * storm is lurking. + */ + if (likely(!test_bit(IRQTF_DIED, + &action->thread_flags))) { + set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + wake_up_process(action->thread); + } + + /* Fall through to add to randomness */ + case IRQ_HANDLED: status |= action->flags; + break; + + default: + break; + } + retval |= ret; action = action->next; } while (action); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1516ab77355c..7e2e7dd4cd2f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -8,16 +8,15 @@ */ #include <linux/irq.h> +#include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include <linux/slab.h> +#include <linux/sched.h> #include "internals.h" -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) -cpumask_var_t irq_default_affinity; - /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * @irq: interrupt number to wait for @@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq) /* Oops, that failed? */ } while (status & IRQ_INPROGRESS); + + /* + * We made sure that no hardirq handler is running. Now verify + * that no threaded handlers are active. 
+ */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); } EXPORT_SYMBOL(synchronize_irq); +#ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; + /** * irq_can_set_affinity - Check if the affinity of a given irq can be set * @irq: Interrupt to check @@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq) return 1; } +static void +irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +{ + struct irqaction *action = desc->action; + + while (action) { + if (action->thread) + set_cpus_allowed_ptr(action->thread, cpumask); + action = action->next; + } +} + /** * irq_set_affinity - Set the irq affinity of a given irq * @irq: Interrupt to set affinity @@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif + irq_set_thread_affinity(desc, cpumask); desc->status |= IRQ_AFFINITY_SET; spin_unlock_irqrestore(&desc->lock, flags); return 0; @@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); + if (!ret) + irq_set_thread_affinity(desc, desc->affinity); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -401,6 +424,90 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, return ret; } +static int irq_wait_for_interrupt(struct irqaction *action) +{ + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { + __set_current_state(TASK_RUNNING); + return 0; + } + schedule(); + } + return -1; +} + +/* + * Interrupt handler thread + */ +static int irq_thread(void *data) +{ + struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; + struct irqaction *action = data; + struct irq_desc *desc = irq_to_desc(action->irq); + int wake; + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->irqaction = action; + + while (!irq_wait_for_interrupt(action)) { + + atomic_inc(&desc->threads_active); + + spin_lock_irq(&desc->lock); + if (unlikely(desc->status & IRQ_DISABLED)) { + /* + * CHECKME: We might need a dedicated + * IRQ_THREAD_PENDING flag here, which + * retriggers the thread in check_irq_resend() + * but AFAICT IRQ_PENDING should be fine as it + * retriggers the interrupt itself --- tglx + */ + desc->status |= IRQ_PENDING; + spin_unlock_irq(&desc->lock); + } else { + spin_unlock_irq(&desc->lock); + + action->thread_fn(action->irq, action->dev_id); + } + + wake = atomic_dec_and_test(&desc->threads_active); + + if (wake && waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); + } + + /* + * Clear irqaction. Otherwise exit_irq_thread() would make + * fuzz about an active irq thread going into nirvana. + */ + current->irqaction = NULL; + return 0; +} + +/* + * Called from do_exit() + */ +void exit_irq_thread(void) +{ + struct task_struct *tsk = current; + + if (!tsk->irqaction) + return; + + printk(KERN_ERR + "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + + /* + * Set the THREAD DIED flag to prevent further wakeups of the + * soon to be gone threaded handler. + */ + set_bit(IRQTF_DIED, &tsk->irqaction->flags); +} + /* * Internal function to register an irqaction - typically used to * allocate special interrupts that are part of the architecture. 
@@ -437,6 +544,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) } /* + * Threaded handler ? + */ + if (new->thread_fn) { + struct task_struct *t; + + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + if (IS_ERR(t)) + return PTR_ERR(t); + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + wake_up_process(t); + } + + /* * The following block of code has to be executed atomically */ spin_lock_irqsave(&desc->lock, flags); @@ -473,15 +600,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (!shared) { irq_chip_set_defaults(desc->chip); + init_waitqueue_head(&desc->wait_for_threads); + /* Setup the type (level, edge polarity) if configured: */ if (new->flags & IRQF_TRIGGER_MASK) { ret = __irq_set_trigger(desc, irq, new->flags & IRQF_TRIGGER_MASK); - if (ret) { - spin_unlock_irqrestore(&desc->lock, flags); - return ret; - } + if (ret) + goto out_thread; } else compat_irq_chip_set_default_handler(desc); #if defined(CONFIG_IRQ_PER_CPU) @@ -549,8 +676,19 @@ mismatch: dump_stack(); } #endif + ret = -EBUSY; + +out_thread: spin_unlock_irqrestore(&desc->lock, flags); - return -EBUSY; + if (new->thread) { + struct task_struct *t = new->thread; + + new->thread = NULL; + if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) + kthread_stop(t); + put_task_struct(t); + } + return ret; } /** @@ -576,6 +714,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action, **action_ptr; + struct task_struct *irqthread; unsigned long flags; WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); @@ -622,6 +761,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) else desc->chip->disable(irq); } + + irqthread = action->thread; + action->thread = NULL; + spin_unlock_irqrestore(&desc->lock, flags); unregister_handler_proc(irq, action); @@ -629,6 +772,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU: */ synchronize_irq(irq); + if (irqthread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(irqthread); + put_task_struct(irqthread); + } + #ifdef CONFIG_DEBUG_SHIRQ /* * It's a shared IRQ -- the driver ought to be prepared for an IRQ @@ -681,9 +830,12 @@ void free_irq(unsigned int irq, void *dev_id) EXPORT_SYMBOL(free_irq); /** - * request_irq - allocate an interrupt line + * request_threaded_irq - allocate an interrupt line * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs + * @handler: Function to be called when the IRQ occurs. + * Primary handler for threaded interrupts + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created * @irqflags: Interrupt type flags * @devname: An ascii name for the claiming device * @dev_id: A cookie passed back to the handler function @@ -695,6 +847,15 @@ EXPORT_SYMBOL(free_irq); * raises, you must take care both to initialise your hardware * and to set up the interrupt handler in the right order. * + * If you want to set up a threaded irq handler for your device + * then you need to supply @handler and @thread_fn. @handler ist + * still called in hard interrupt context and has to check + * whether the interrupt originates from the device. 
If yes it + * needs to disable the interrupt on the device and return + * IRQ_THREAD_WAKE which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support + * shared interrupts. + * * Dev_id must be globally unique. Normally the address of the * device data structure is used as the cookie. Since the handler * receives this value it makes sense to use it. @@ -710,8 +871,9 @@ EXPORT_SYMBOL(free_irq); * IRQF_TRIGGER_* Specify active edge(s) or level * */ -int request_irq(unsigned int irq, irq_handler_t handler, - unsigned long irqflags, const char *devname, void *dev_id) +int request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, unsigned long irqflags, + const char *devname, void *dev_id) { struct irqaction *action; struct irq_desc *desc; @@ -759,6 +921,7 @@ int request_irq(unsigned int irq, irq_handler_t handler, return -ENOMEM; action->handler = handler; + action->thread_fn = thread_fn; action->flags = irqflags; action->name = devname; action->dev_id = dev_id; @@ -788,4 +951,4 @@ int request_irq(unsigned int irq, irq_handler_t handler, #endif return retval; } -EXPORT_SYMBOL(request_irq); +EXPORT_SYMBOL(request_threaded_irq); diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 243d6121e50e..44bbdcbaf8d2 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) { free_kstat_irqs(old_desc, desc); + free_desc_masks(old_desc, desc); arch_free_chip_data(old_desc, desc); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5016bfb682b9..a5e74ddee0e2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -68,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ -static bool kprobe_enabled; +static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; @@ -328,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->pre_handler && !kprobe_gone(kp)) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); if (kp->pre_handler(kp, regs)) return 1; @@ -344,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, struct kprobe *kp; list_for_each_entry_rcu(kp, &p->list, list) { - if (kp->post_handler && !kprobe_gone(kp)) { + if (kp->post_handler && likely(!kprobe_disabled(kp))) { set_kprobe_instance(kp); kp->post_handler(kp, regs, flags); reset_kprobe_instance(); @@ -518,20 +518,28 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) } /* -* Add the new probe to old_p->list. Fail if this is the +* Add the new probe to ap->list. 
Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { + BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); if (p->break_handler) { - if (old_p->break_handler) + if (ap->break_handler) return -EEXIST; - list_add_tail_rcu(&p->list, &old_p->list); - old_p->break_handler = aggr_break_handler; + list_add_tail_rcu(&p->list, &ap->list); + ap->break_handler = aggr_break_handler; } else - list_add_rcu(&p->list, &old_p->list); - if (p->post_handler && !old_p->post_handler) - old_p->post_handler = aggr_post_handler; + list_add_rcu(&p->list, &ap->list); + if (p->post_handler && !ap->post_handler) + ap->post_handler = aggr_post_handler; + + if (kprobe_disabled(ap) && !kprobe_disabled(p)) { + ap->flags &= ~KPROBE_FLAG_DISABLED; + if (!kprobes_all_disarmed) + /* Arm the breakpoint again. */ + arch_arm_kprobe(ap); + } return 0; } @@ -544,6 +552,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; + ap->flags = p->flags; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ @@ -566,44 +575,59 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) { int ret = 0; - struct kprobe *ap; + struct kprobe *ap = old_p; - if (kprobe_gone(old_p)) { + if (old_p->pre_handler != aggr_pre_handler) { + /* If old_p is not an aggr_probe, create new aggr_kprobe. */ + ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!ap) + return -ENOMEM; + add_aggr_kprobe(ap, old_p); + } + + if (kprobe_gone(ap)) { /* * Attempting to insert new probe at the same location that * had a probe in the module vaddr area which already * freed. So, the instruction slot has already been * released. We need a new slot for the new probe. */ - ret = arch_prepare_kprobe(old_p); + ret = arch_prepare_kprobe(ap); if (ret) + /* + * Even if fail to allocate new slot, don't need to + * free aggr_probe. It will be used next time, or + * freed by unregister_kprobe. + */ return ret; - } - if (old_p->pre_handler == aggr_pre_handler) { - copy_kprobe(old_p, p); - ret = add_new_kprobe(old_p, p); - ap = old_p; - } else { - ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); - if (!ap) { - if (kprobe_gone(old_p)) - arch_remove_kprobe(old_p); - return -ENOMEM; - } - add_aggr_kprobe(ap, old_p); - copy_kprobe(ap, p); - ret = add_new_kprobe(ap, p); - } - if (kprobe_gone(old_p)) { + /* - * If the old_p has gone, its breakpoint has been disarmed. - * We have to arm it again after preparing real kprobes. + * Clear gone flag to prevent allocating new slot again, and + * set disabled flag because it is not armed yet. */ - ap->flags &= ~KPROBE_FLAG_GONE; - if (kprobe_enabled) - arch_arm_kprobe(ap); + ap->flags = (ap->flags & ~KPROBE_FLAG_GONE) + | KPROBE_FLAG_DISABLED; } - return ret; + + copy_kprobe(ap, p); + return add_new_kprobe(ap, p); +} + +/* Try to disable aggr_kprobe, and return 1 if succeeded.*/ +static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (!kprobe_disabled(kp)) + /* + * There is an active probe on the list. + * We can't disable aggr_kprobe. 
+ */ + return 0; + } + p->flags |= KPROBE_FLAG_DISABLED; + return 1; } static int __kprobes in_kprobes_functions(unsigned long addr) @@ -664,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p) return -EINVAL; } - p->flags = 0; + /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ + p->flags &= KPROBE_FLAG_DISABLED; + /* * Check if are we probing a module. */ @@ -709,7 +735,7 @@ int __kprobes register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (kprobe_enabled) + if (!kprobes_all_disarmed && !kprobe_disabled(p)) arch_arm_kprobe(p); out_unlock_text: @@ -722,26 +748,39 @@ out: return ret; } +EXPORT_SYMBOL_GPL(register_kprobe); -/* - * Unregister a kprobe without a scheduler synchronization. - */ -static int __kprobes __unregister_kprobe_top(struct kprobe *p) +/* Check passed kprobe is valid and return kprobe in kprobe_table. */ +static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p) { struct kprobe *old_p, *list_p; old_p = get_kprobe(p->addr); if (unlikely(!old_p)) - return -EINVAL; + return NULL; if (p != old_p) { list_for_each_entry_rcu(list_p, &old_p->list, list) if (list_p == p) /* kprobe p is a valid probe */ - goto valid_p; - return -EINVAL; + goto valid; + return NULL; } -valid_p: +valid: + return old_p; +} + +/* + * Unregister a kprobe without a scheduler synchronization. + */ +static int __kprobes __unregister_kprobe_top(struct kprobe *p) +{ + struct kprobe *old_p, *list_p; + + old_p = __get_valid_kprobe(p); + if (old_p == NULL) + return -EINVAL; + if (old_p == p || (old_p->pre_handler == aggr_pre_handler && list_is_singular(&old_p->list))) { @@ -750,7 +789,7 @@ valid_p: * enabled and not gone - otherwise, the breakpoint would * already have been removed. We save on flushing icache. 
*/ - if (kprobe_enabled && !kprobe_gone(old_p)) { + if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) { mutex_lock(&text_mutex); arch_disarm_kprobe(p); mutex_unlock(&text_mutex); @@ -768,6 +807,11 @@ valid_p: } noclean: list_del_rcu(&p->list); + if (!kprobe_disabled(old_p)) { + try_to_disable_aggr_kprobe(old_p); + if (!kprobes_all_disarmed && kprobe_disabled(old_p)) + arch_disarm_kprobe(old_p); + } } return 0; } @@ -803,11 +847,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num) } return ret; } +EXPORT_SYMBOL_GPL(register_kprobes); void __kprobes unregister_kprobe(struct kprobe *p) { unregister_kprobes(&p, 1); } +EXPORT_SYMBOL_GPL(unregister_kprobe); void __kprobes unregister_kprobes(struct kprobe **kps, int num) { @@ -826,6 +872,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num) if (kps[i]->addr) __unregister_kprobe_bottom(kps[i]); } +EXPORT_SYMBOL_GPL(unregister_kprobes); static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, @@ -865,16 +912,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) } return ret; } +EXPORT_SYMBOL_GPL(register_jprobes); int __kprobes register_jprobe(struct jprobe *jp) { return register_jprobes(&jp, 1); } +EXPORT_SYMBOL_GPL(register_jprobe); void __kprobes unregister_jprobe(struct jprobe *jp) { unregister_jprobes(&jp, 1); } +EXPORT_SYMBOL_GPL(unregister_jprobe); void __kprobes unregister_jprobes(struct jprobe **jps, int num) { @@ -894,6 +944,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num) __unregister_kprobe_bottom(&jps[i]->kp); } } +EXPORT_SYMBOL_GPL(unregister_jprobes); #ifdef CONFIG_KRETPROBES /* @@ -987,6 +1038,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) free_rp_inst(rp); return ret; } +EXPORT_SYMBOL_GPL(register_kretprobe); int __kprobes register_kretprobes(struct kretprobe **rps, int num) { @@ -1004,11 +1056,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num) } return ret; } +EXPORT_SYMBOL_GPL(register_kretprobes); void __kprobes unregister_kretprobe(struct kretprobe *rp) { unregister_kretprobes(&rp, 1); } +EXPORT_SYMBOL_GPL(unregister_kretprobe); void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { @@ -1030,24 +1084,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) } } } +EXPORT_SYMBOL_GPL(unregister_kretprobes); #else /* CONFIG_KRETPROBES */ int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } +EXPORT_SYMBOL_GPL(register_kretprobe); int __kprobes register_kretprobes(struct kretprobe **rps, int num) { return -ENOSYS; } +EXPORT_SYMBOL_GPL(register_kretprobes); + void __kprobes unregister_kretprobe(struct kretprobe *rp) { } +EXPORT_SYMBOL_GPL(unregister_kretprobe); void __kprobes unregister_kretprobes(struct kretprobe **rps, int num) { } +EXPORT_SYMBOL_GPL(unregister_kretprobes); static int __kprobes pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) @@ -1061,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, static void __kprobes kill_kprobe(struct kprobe *p) { struct kprobe *kp; + p->flags |= KPROBE_FLAG_GONE; if (p->pre_handler == aggr_pre_handler) { /* @@ -1173,8 +1234,8 @@ static int __init init_kprobes(void) } } - /* By default, kprobes are enabled */ - kprobe_enabled = true; + /* By default, kprobes are armed */ + kprobes_all_disarmed = false; err = arch_init_kprobes(); if (!err) @@ -1202,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, else kprobe_type = "k"; if 
(sym) - seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type, - sym, offset, (modname ? modname : " "), - (kprobe_gone(p) ? "[GONE]" : "")); + seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", + p->addr, kprobe_type, sym, offset, + (modname ? modname : " "), + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? + "[DISABLED]" : "")); else - seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr, - (kprobe_gone(p) ? "[GONE]" : "")); + seq_printf(pi, "%p %s %p %s%s\n", + p->addr, kprobe_type, p->addr, + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? + "[DISABLED]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1272,7 +1339,72 @@ static struct file_operations debugfs_kprobes_operations = { .release = seq_release, }; -static void __kprobes enable_all_kprobes(void) +/* Disable one kprobe */ +int __kprobes disable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + /* If the probe is already disabled (or gone), just return */ + if (kprobe_disabled(kp)) + goto out; + + kp->flags |= KPROBE_FLAG_DISABLED; + if (p != kp) + /* When kp != p, p is always enabled. */ + try_to_disable_aggr_kprobe(p); + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + arch_disarm_kprobe(p); +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(disable_kprobe); + +/* Enable one kprobe */ +int __kprobes enable_kprobe(struct kprobe *kp) +{ + int ret = 0; + struct kprobe *p; + + mutex_lock(&kprobe_mutex); + + /* Check whether specified probe is valid. */ + p = __get_valid_kprobe(kp); + if (unlikely(p == NULL)) { + ret = -EINVAL; + goto out; + } + + if (kprobe_gone(kp)) { + /* This kprobe has gone, we couldn't enable it. 
*/ + ret = -EINVAL; + goto out; + } + + if (!kprobes_all_disarmed && kprobe_disabled(p)) + arch_arm_kprobe(p); + + p->flags &= ~KPROBE_FLAG_DISABLED; + if (p != kp) + kp->flags &= ~KPROBE_FLAG_DISABLED; +out: + mutex_unlock(&kprobe_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(enable_kprobe); + +static void __kprobes arm_all_kprobes(void) { struct hlist_head *head; struct hlist_node *node; @@ -1281,20 +1413,20 @@ static void __kprobes enable_all_kprobes(void) mutex_lock(&kprobe_mutex); - /* If kprobes are already enabled, just return */ - if (kprobe_enabled) + /* If kprobes are armed, just return */ + if (!kprobes_all_disarmed) goto already_enabled; mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) - if (!kprobe_gone(p)) + if (!kprobe_disabled(p)) arch_arm_kprobe(p); } mutex_unlock(&text_mutex); - kprobe_enabled = true; + kprobes_all_disarmed = false; printk(KERN_INFO "Kprobes globally enabled\n"); already_enabled: @@ -1302,7 +1434,7 @@ already_enabled: return; } -static void __kprobes disable_all_kprobes(void) +static void __kprobes disarm_all_kprobes(void) { struct hlist_head *head; struct hlist_node *node; @@ -1311,17 +1443,17 @@ static void __kprobes disable_all_kprobes(void) mutex_lock(&kprobe_mutex); - /* If kprobes are already disabled, just return */ - if (!kprobe_enabled) + /* If kprobes are already disarmed, just return */ + if (kprobes_all_disarmed) goto already_disabled; - kprobe_enabled = false; + kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_gone(p)) + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) arch_disarm_kprobe(p); } } @@ -1347,7 +1479,7 @@ static ssize_t read_enabled_file_bool(struct file *file, { char buf[3]; - if (kprobe_enabled) + if (!kprobes_all_disarmed) buf[0] = '1'; else buf[0] = '0'; @@ -1370,12 +1502,12 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'y': case 'Y': case '1': - enable_all_kprobes(); + arm_all_kprobes(); break; case 'n': case 'N': case '0': - disable_all_kprobes(); + disarm_all_kprobes(); break; } @@ -1418,16 +1550,5 @@ late_initcall(debugfs_kprobe_init); module_init(init_kprobes); -EXPORT_SYMBOL_GPL(register_kprobe); -EXPORT_SYMBOL_GPL(unregister_kprobe); -EXPORT_SYMBOL_GPL(register_kprobes); -EXPORT_SYMBOL_GPL(unregister_kprobes); -EXPORT_SYMBOL_GPL(register_jprobe); -EXPORT_SYMBOL_GPL(unregister_jprobe); -EXPORT_SYMBOL_GPL(register_jprobes); -EXPORT_SYMBOL_GPL(unregister_jprobes); +/* defined in arch/.../kernel/kprobes.c */ EXPORT_SYMBOL_GPL(jprobe_return); -EXPORT_SYMBOL_GPL(register_kretprobe); -EXPORT_SYMBOL_GPL(unregister_kretprobe); -EXPORT_SYMBOL_GPL(register_kretprobes); -EXPORT_SYMBOL_GPL(unregister_kretprobes); diff --git a/kernel/kthread.c b/kernel/kthread.c index 84bbadd4d021..4ebaf8519abf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -76,6 +76,7 @@ static int kthread(void *_create) /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); + create->result = current; complete(&create->started); schedule(); @@ -96,22 +97,10 @@ static void create_kthread(struct kthread_create_info *create) /* We want our own signal handler (we take no signals by default). 
*/ pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); - if (pid < 0) { + if (pid < 0) create->result = ERR_PTR(pid); - } else { - struct sched_param param = { .sched_priority = 0 }; + else wait_for_completion(&create->started); - read_lock(&tasklist_lock); - create->result = find_task_by_pid_ns(pid, &init_pid_ns); - read_unlock(&tasklist_lock); - /* - * root may have changed our (kthreadd's) priority or CPU mask. - * The kernel thread should not inherit these properties. - */ - sched_setscheduler(create->result, SCHED_NORMAL, ¶m); - set_user_nice(create->result, KTHREAD_NICE_LEVEL); - set_cpus_allowed_ptr(create->result, cpu_all_mask); - } complete(&create->done); } @@ -154,11 +143,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), wait_for_completion(&create.done); if (!IS_ERR(create.result)) { + struct sched_param param = { .sched_priority = 0 }; va_list args; + va_start(args, namefmt); vsnprintf(create.result->comm, sizeof(create.result->comm), namefmt, args); va_end(args); + /* + * root may have changed our (kthreadd's) priority or CPU mask. + * The kernel thread should not inherit these properties. + */ + sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); + set_user_nice(create.result, KTHREAD_NICE_LEVEL); + set_cpus_allowed_ptr(create.result, cpu_all_mask); } return create.result; } diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81b5f33970b8..b0f011866969 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -793,6 +793,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return NULL; } class = lock_classes + nr_lock_classes++; @@ -856,6 +857,7 @@ static struct lock_list *alloc_list_entry(void) printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return NULL; } return list_entries + nr_list_entries++; @@ -1682,6 +1684,7 @@ cache_hit: printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return 0; } chain = lock_chains + nr_lock_chains++; @@ -2541,6 +2544,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, debug_locks_off(); printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return 0; } @@ -2637,6 +2641,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, debug_locks_off(); printk("BUG: MAX_LOCK_DEPTH too low!\n"); printk("turning off the locking correctness validator.\n"); + dump_stack(); return 0; } diff --git a/kernel/module.c b/kernel/module.c index c268a771595c..e797812a4d95 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1952,9 +1952,6 @@ static noinline struct module *load_module(void __user *umod, if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; #endif - /* Don't keep __versions around; it's just for loading. 
*/ - if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0) - sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; } modindex = find_sec(hdr, sechdrs, secstrings, @@ -2391,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); + /* We need to finish all async code before the module init sequence is done */ + async_synchronize_full(); + mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a3..507cf2b5e9f1 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire(&lock->dep_map, subclass, 0, ip); -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ + !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) /* * Optimistic spinning. * diff --git a/kernel/panic.c b/kernel/panic.c index 3fd8c5bf8b39..934fb377f4b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -213,8 +213,16 @@ unsigned long get_taint(void) void add_taint(unsigned flag) { - /* can't trust the integrity of the kernel anymore: */ - debug_locks = 0; + /* + * Can't trust the integrity of the kernel anymore. + * We don't call directly debug_locks_off() because the issue + * is not necessarily serious enough to set oops_in_progress to 1 + * Also we want to keep up lockdep for staging development and + * post-warning case. + */ + if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) + printk(KERN_WARNING "Disabling lockdep due to kernel taint\n"); + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 8e5d9a68b022..c9dcf98b4463 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -18,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new) cputime = secs_to_cputime(rlim_new); if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_lt(current->signal->it_prof_expires, cputime)) { + cputime_gt(current->signal->it_prof_expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, cpu->cpu = virt_ticks(p); break; case CPUCLOCK_SCHED: - cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); + cpu->sched = task_sched_runtime(p); break; } return 0; @@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock, { struct task_cputime cputime; - thread_group_cputime(p, &cputime); switch (CPUCLOCK_WHICH(which_clock)) { default: return -EINVAL; case CPUCLOCK_PROF: + thread_group_cputime(p, &cputime); cpu->cpu = cputime_add(cputime.utime, cputime.stime); break; case CPUCLOCK_VIRT: + thread_group_cputime(p, &cputime); cpu->cpu = cputime.utime; break; case CPUCLOCK_SCHED: - cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + cpu->sched = thread_group_sched_runtime(p); break; } return 0; diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 5f21ab2bbcdf..0854770b63b9 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,6 +22,7 @@ #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> +#include <scsi/scsi_scan.h> #include <asm/suspend.h> #include "power.h" @@ -645,6 +646,13 @@ static int 
software_resume(void) return 0; /* + * We can't depend on SCSI devices being available after loading one of + * their modules if scsi_complete_async_scans() is not called and the + * resume device usually is a SCSI one. + */ + scsi_complete_async_scans(); + + /* * name_to_dev_t() below takes a sysfs buffer mutex when sysfs * is configured into the kernel. Since the regular hibernate * trigger path is via sysfs which takes a buffer mutex before diff --git a/kernel/power/user.c b/kernel/power/user.c index 6c85359364f2..ed97375daae9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,6 +24,7 @@ #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/smp_lock.h> +#include <scsi/scsi_scan.h> #include <asm/uaccess.h> @@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + /* Hibernating. The image device should be accessible. */ data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; @@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); } else { + /* + * Resuming. We may need to wait for the image device to + * appear. + */ + wait_for_device_probe(); + scsi_complete_async_scans(); + data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain(PM_RESTORE_PREPARE); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aaad0ec34194..dfcd83ceee3b 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -21,9 +21,7 @@ #include <linux/audit.h> #include <linux/pid_namespace.h> #include <linux/syscalls.h> - -#include <asm/pgtable.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* @@ -48,7 +46,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) list_add(&child->ptrace_entry, &new_parent->ptraced); child->parent = new_parent; } - + /* * Turn a tracing stop into a normal stop now, since with no tracer there * would be no way to wake it up with SIGCONT or SIGKILL. If there was a @@ -173,7 +171,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) task_lock(task); err = __ptrace_may_access(task, mode); task_unlock(task); - return (!err ? true : false); + return !err; } int ptrace_attach(struct task_struct *task) @@ -358,7 +356,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst copied += retval; src += retval; dst += retval; - len -= retval; + len -= retval; } return copied; } @@ -383,7 +381,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds copied += retval; src += retval; dst += retval; - len -= retval; + len -= retval; } return copied; } @@ -496,9 +494,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data) if (unlikely(!arch_has_single_step())) return -EIO; user_enable_single_step(child); - } - else + } else { user_disable_single_step(child); + } child->exit_code = data; wake_up_process(child); @@ -606,10 +604,11 @@ repeat: ret = security_ptrace_traceme(current->parent); /* - * Set the ptrace bit in the process ptrace flags. - * Then link us on our parent's ptraced list. + * Check PF_EXITING to ensure ->real_parent has not passed + * exit_ptrace(). Otherwise we don't report the error but + * pretend ->real_parent untraces us right after return. 
*/ - if (!ret) { + if (!ret && !(current->real_parent->flags & PF_EXITING)) { current->ptrace |= PT_PTRACED; __ptrace_link(current, current->real_parent); } diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index 654c640a6b9c..0f2b0b311304 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = { .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), .cpumask = CPU_BITS_NONE, }; + static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cur = -300, .completed = -300, @@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { .cpumask = CPU_BITS_NONE, }; -DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; +static DEFINE_PER_CPU(struct rcu_data, rcu_data); +static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; +} + +void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; +} static int blimit = 10; static int qhimark = 10000; diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 5d59e850fb71..ce97a4df64d3 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -147,7 +147,51 @@ struct rcu_ctrlblk { wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ }; +struct rcu_dyntick_sched { + int dynticks; + int dynticks_snap; + int sched_qs; + int sched_qs_snap; + int sched_dynticks_snap; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { + .dynticks = 1, +}; + +void rcu_qsctr_inc(int cpu) +{ + struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu); + + rdssp->sched_qs++; +} + +#ifdef CONFIG_NO_HZ + +void rcu_enter_nohz(void) +{ + static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); + + smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */ + __get_cpu_var(rcu_dyntick_sched).dynticks++; + WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs); +} + +void rcu_exit_nohz(void) +{ + static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1); + + __get_cpu_var(rcu_dyntick_sched).dynticks++; + smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */ + WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1), + &rs); +} + +#endif /* CONFIG_NO_HZ */ + + static DEFINE_PER_CPU(struct rcu_data, rcu_data); + static struct rcu_ctrlblk rcu_ctrlblk = { .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), .completed = 0, @@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = { - .dynticks = 1, -}; - #ifdef CONFIG_NO_HZ static DEFINE_PER_CPU(int, rcu_update_flag); diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 97ce31579ec0..7f3266922572 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. 
Thus just a flag. + */ +void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + +void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + #ifdef CONFIG_NO_HZ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = 1, diff --git a/kernel/rcutree.h b/kernel/rcutree.h new file mode 100644 index 000000000000..5e872bbf07f5 --- /dev/null +++ b/kernel/rcutree.h @@ -0,0 +1,10 @@ + +/* + * RCU implementation internal declarations: + */ +extern struct rcu_state rcu_state; +DECLARE_PER_CPU(struct rcu_data, rcu_data); + +extern struct rcu_state rcu_bh_state; +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index d6db3e837826..4ee954f6a8d5 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -43,6 +43,8 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> +#include "rcutree.h" + static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) diff --git a/kernel/sched.c b/kernel/sched.c index bec249885e17..5724508c3b66 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) spin_lock(&rt_b->rt_runtime_lock); for (;;) { + unsigned long delta; + ktime_t soft, hard; + if (hrtimer_active(&rt_b->rt_period_timer)) break; now = hrtimer_cb_get_time(&rt_b->rt_period_timer); hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); - hrtimer_start_expires(&rt_b->rt_period_timer, - HRTIMER_MODE_ABS); + + soft = hrtimer_get_softexpires(&rt_b->rt_period_timer); + hard = hrtimer_get_expires(&rt_b->rt_period_timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, + HRTIMER_MODE_ABS, 0); } spin_unlock(&rt_b->rt_runtime_lock); } @@ -1146,7 +1153,8 @@ static __init void init_hrtick(void) */ static void hrtick_start(struct rq *rq, u64 delay) { - hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL, 0); } static inline void init_hrtick(void) @@ -1410,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, struct rq_iterator *iterator); #endif +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + #ifdef CONFIG_CGROUP_CPUACCT static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) {} #endif static inline void inc_cpu_load(struct rq *rq, unsigned long load) @@ -4503,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * Return any ns on the sched_clock that have not yet been banked in + * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. 
*/ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + if (task_current(rq, p)) { + update_rq_clock(rq); + ns = rq->clock - p->se.exec_start; + if ((s64)ns < 0) + ns = 0; + } + + return ns; +} + unsigned long long task_delta_exec(struct task_struct *p) { unsigned long flags; @@ -4513,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p) u64 ns = 0; rq = task_rq_lock(p, &flags); + ns = do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); - if (task_current(rq, p)) { - u64 delta_exec; + return ns; +} - update_rq_clock(rq); - delta_exec = rq->clock - p->se.exec_start; - if ((s64)delta_exec > 0) - ns = delta_exec; - } +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus current's + * pending runtime that have not been accounted yet. + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + task_rq_unlock(rq, &flags); + + return ns; +} +/* + * Return sum_exec_runtime for the thread group. + * In case the task is currently running, return the sum plus current's + * pending runtime that have not been accounted yet. + * + * Note that the thread group might have other running tasks as well, + * so the return value not includes other pending runtime that other + * running tasks might have. + */ +unsigned long long thread_group_sched_runtime(struct task_struct *p) +{ + struct task_cputime totals; + unsigned long flags; + struct rq *rq; + u64 ns; + + rq = task_rq_lock(p, &flags); + thread_group_cputime(p, &totals); + ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); task_rq_unlock(rq, &flags); return ns; @@ -4551,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime, cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); + + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); /* Account for user time used */ acct_update_integrals(p); } @@ -4612,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else cpustat->system = cputime64_add(cpustat->system, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + /* Account for system time used */ acct_update_integrals(p); } @@ -7294,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpumask_or(groupmask, groupmask, sched_group_cpus(group)); cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); - printk(KERN_CONT " %s", str); + printk(KERN_CONT " %s (__cpu_power = %d)", str, + group->__cpu_power); group = group->next; } while (group != sd->groups); @@ -9917,6 +9991,7 @@ struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ u64 *cpuusage; + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; }; @@ -9941,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create( struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + int i; if (!ca) - return ERR_PTR(-ENOMEM); + goto out; ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) { - kfree(ca); - return ERR_PTR(-ENOMEM); - } + if (!ca->cpuusage) + goto out_free_ca; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + if (percpu_counter_init(&ca->cpustat[i], 0)) + goto out_free_counters; if (cgrp->parent) ca->parent = 
cgroup_ca(cgrp->parent); return &ca->css; + +out_free_counters: + while (--i >= 0) + percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); } /* destroy an existing cpu accounting group */ @@ -9962,7 +10049,10 @@ static void cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); + int i; + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + percpu_counter_destroy(&ca->cpustat[i]); free_percpu(ca->cpuusage); kfree(ca); } @@ -10049,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, return 0; } +static const char *cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int i; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { + s64 val = percpu_counter_read(&ca->cpustat[i]); + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[i], val); + } + return 0; +} + static struct cftype files[] = { { .name = "usage", @@ -10059,7 +10168,10 @@ static struct cftype files[] = { .name = "usage_percpu", .read_seq_string = cpuacct_percpu_seq_read, }, - + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, }; static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) @@ -10081,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) return; cpu = task_cpu(tsk); + + rcu_read_lock(); + ca = task_ca(tsk); for (; ca; ca = ca->parent) { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } + + rcu_read_unlock(); +} + +/* + * Charge the system/user time to the task's accounting group. + */ +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) +{ + struct cpuacct *ca; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(tsk); + + do { + percpu_counter_add(&ca->cpustat[idx], val); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); } struct cgroup_subsys cpuacct_subsys = { diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 1e00bfacf9b8..cdd3c89574cd 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -55,7 +55,7 @@ static int convert_prio(int prio) * cpupri_find - find the best (lowest-pri) CPU in the system * @cp: The cpupri context * @p: The task - * @lowest_mask: A mask to fill in with selected CPUs + * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the * current invokation. 
By the time the call returns, the CPUs may have in @@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) continue; - cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + if (lowest_mask) + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); return 1; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 299d012b4394..f2c66f8f9712 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync) static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - cpumask_var_t mask; - if (rq->curr->rt.nr_cpus_allowed == 1) return; - if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) - return; - if (p->rt.nr_cpus_allowed != 1 - && cpupri_find(&rq->rd->cpupri, p, mask)) - goto free; + && cpupri_find(&rq->rd->cpupri, p, NULL)) + return; - if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) - goto free; + if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + return; /* * There appears to be other cpus that can accept @@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ requeue_task_rt(rq, p, 1); resched_task(rq->curr); -free: - free_cpumask_var(mask); } #endif /* CONFIG_SMP */ diff --git a/kernel/softirq.c b/kernel/softirq.c index d105a82543d0..2fecefacdc5b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -65,7 +65,7 @@ char *softirq_to_name[NR_SOFTIRQS] = { * to the pending events, so lets the scheduler to balance * the softirq load for us. */ -static inline void wakeup_softirqd(void) +void wakeup_softirqd(void) { /* Interrupts are disabled: no need to stop preemption */ struct task_struct *tsk = __get_cpu_var(ksoftirqd); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 85d5a2455103..88796c330838 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -166,97 +166,11 @@ void softlockup_tick(void) } /* - * Have a reasonable limit on the number of tasks checked: - */ -unsigned long __read_mostly sysctl_hung_task_check_count = 1024; - -/* - * Zero means infinite timeout - no checking done: - */ -unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480; - -unsigned long __read_mostly sysctl_hung_task_warnings = 10; - -/* - * Only do the hung-tasks check on one CPU: - */ -static int check_cpu __read_mostly = -1; - -static void check_hung_task(struct task_struct *t, unsigned long now) -{ - unsigned long switch_count = t->nvcsw + t->nivcsw; - - if (t->flags & PF_FROZEN) - return; - - if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { - t->last_switch_count = switch_count; - t->last_switch_timestamp = now; - return; - } - if ((long)(now - t->last_switch_timestamp) < - sysctl_hung_task_timeout_secs) - return; - if (!sysctl_hung_task_warnings) - return; - sysctl_hung_task_warnings--; - - /* - * Ok, the task did not get scheduled for more than 2 minutes, - * complain: - */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, - sysctl_hung_task_timeout_secs); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); - sched_show_task(t); - __debug_show_held_locks(t); - - t->last_switch_timestamp = now; - touch_nmi_watchdog(); - - if (softlockup_panic) - panic("softlockup: blocked tasks"); -} - -/* - * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for - * a really long time (120 seconds). If that happens, print out - * a warning. 
- */ -static void check_hung_uninterruptible_tasks(int this_cpu) -{ - int max_count = sysctl_hung_task_check_count; - unsigned long now = get_timestamp(this_cpu); - struct task_struct *g, *t; - - /* - * If the system crashed already then all bets are off, - * do not report extra hung tasks: - */ - if (test_taint(TAINT_DIE) || did_panic) - return; - - read_lock(&tasklist_lock); - do_each_thread(g, t) { - if (!--max_count) - goto unlock; - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (t->state == TASK_UNINTERRUPTIBLE) - check_hung_task(t, now); - } while_each_thread(g, t); - unlock: - read_unlock(&tasklist_lock); -} - -/* * The watchdog thread - runs every second and touches the timestamp. */ static int watchdog(void *__bind_cpu) { struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; - int this_cpu = (long)__bind_cpu; sched_setscheduler(current, SCHED_FIFO, ¶m); @@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu) if (kthread_should_stop()) break; - if (this_cpu == check_cpu) { - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); - } - set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); @@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - check_cpu = cpumask_any(cpu_online_mask); wake_up_process(per_cpu(watchdog_task, hotcpu)); break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - if (hotcpu == check_cpu) { - /* Pick any other online cpu. */ - check_cpu = cpumask_any_but(cpu_online_mask, hotcpu); - } - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: if (!per_cpu(watchdog_task, hotcpu)) diff --git a/kernel/sys.c b/kernel/sys.c index 51dbb55604e8..e7998cf31498 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -360,6 +360,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, void __user *, arg) { char buffer[256]; + int ret = 0; /* We only trust the superuser with rebooting the system. 
*/ if (!capable(CAP_SYS_BOOT)) @@ -397,7 +398,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, kernel_halt(); unlock_kernel(); do_exit(0); - break; + panic("cannot halt"); case LINUX_REBOOT_CMD_POWER_OFF: kernel_power_off(); @@ -417,29 +418,22 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, #ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - { - int ret; - ret = kernel_kexec(); - unlock_kernel(); - return ret; - } + ret = kernel_kexec(); + break; #endif #ifdef CONFIG_HIBERNATION case LINUX_REBOOT_CMD_SW_SUSPEND: - { - int ret = hibernate(); - unlock_kernel(); - return ret; - } + ret = hibernate(); + break; #endif default: - unlock_kernel(); - return -EINVAL; + ret = -EINVAL; + break; } unlock_kernel(); - return 0; + return ret; } static void deferred_cad(struct work_struct *dummy) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 82350f8f04f6..e3d2c7dd59b9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -97,10 +97,11 @@ static int neg_one = -1; #endif static int zero; -static int one = 1; -static int two = 2; +static int __maybe_unused one = 1; +static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ static int maxolduid = 65535; @@ -813,6 +814,19 @@ static struct ctl_table kern_table[] = { .extra1 = &neg_one, .extra2 = &sixty, }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .ctl_name = CTL_UNNUMBERED, + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "hung_task_check_count", @@ -828,7 +842,7 @@ static struct ctl_table kern_table[] = { .data = &sysctl_hung_task_timeout_secs, .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_dohung_task_timeout_secs, .strategy = &sysctl_intvec, }, { @@ -888,16 +902,6 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif -#ifdef CONFIG_UNEVICTABLE_LRU - { - .ctl_name = CTL_UNNUMBERED, - .procname = "scan_unevictable_pages", - .data = &scan_unevictable_pages, - .maxlen = sizeof(scan_unevictable_pages), - .mode = 0644, - .proc_handler = &scan_unevictable_handler, - }, -#endif #ifdef CONFIG_SLOW_WORK { .ctl_name = CTL_UNNUMBERED, @@ -1027,6 +1031,28 @@ static struct ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_pdflush_threads_min", + .data = &nr_pdflush_threads_min, + .maxlen = sizeof nr_pdflush_threads_min, + .mode = 0644 /* read-write */, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &one, + .extra2 = &nr_pdflush_threads_max, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "nr_pdflush_threads_max", + .data = &nr_pdflush_threads_max, + .maxlen = sizeof nr_pdflush_threads_max, + .mode = 0644 /* read-write */, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &nr_pdflush_threads_min, + .extra2 = &one_thousand, + }, + { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", .data = &vm_swappiness, @@ -1266,6 +1292,16 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif +#ifdef CONFIG_UNEVICTABLE_LRU + { + .ctl_name = CTL_UNNUMBERED, + .procname = "scan_unevictable_pages", + .data = 
&scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = &scan_unevictable_handler, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/timer.c b/kernel/timer.c index b4555568b4e4..cffffad01c31 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -531,10 +531,13 @@ static void __init_timer(struct timer_list *timer, } /** - * init_timer - initialize a timer. + * init_timer_key - initialize a timer * @timer: the timer to be initialized + * @name: name of the timer + * @key: lockdep class key of the fake lock used for tracking timer + * sync lock dependencies * - * init_timer() must be done to a timer prior calling *any* of the + * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2246141bda4d..417d1985e299 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -312,7 +312,7 @@ config KMEMTRACE and profile kernel code. This requires an userspace application to use. See - Documentation/vm/kmemtrace.txt for more information. + Documentation/trace/kmemtrace.txt for more information. Saying Y will make the kernel somewhat larger and slower. However, if you disable kmemtrace at run-time or boot-time, the performance @@ -403,7 +403,7 @@ config MMIOTRACE implementation and works via page faults. Tracing is disabled by default and can be enabled at run-time. - See Documentation/tracers/mmiotrace.txt. + See Documentation/trace/mmiotrace.txt. If you are not helping to develop drivers, say N. config MMIOTRACE_TEST diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 947c5b3f90c4..921ef5d1f0ba 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -327,10 +327,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, char *msg; struct blk_trace *bt; - if (count > BLK_TN_MAX_MSG) + if (count >= BLK_TN_MAX_MSG) return -EINVAL; - msg = kmalloc(count, GFP_KERNEL); + msg = kmalloc(count + 1, GFP_KERNEL); if (msg == NULL) return -ENOMEM; @@ -339,6 +339,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, return -EFAULT; } + msg[count] = '\0'; bt = filp->private_data; __trace_note_message(bt, "%s", msg); kfree(msg); @@ -642,7 +643,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (blk_pc_request(rq)) { what |= BLK_TC_ACT(BLK_TC_PC); __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - sizeof(rq->cmd), rq->cmd); + rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, @@ -1376,12 +1377,12 @@ static int blk_trace_str2mask(const char *str) { int i; int mask = 0; - char *s, *token; + char *buf, *s, *token; - s = kstrdup(str, GFP_KERNEL); - if (s == NULL) + buf = kstrdup(str, GFP_KERNEL); + if (buf == NULL) return -ENOMEM; - s = strstrip(s); + s = strstrip(buf); while (1) { token = strsep(&s, ","); @@ -1402,7 +1403,7 @@ static int blk_trace_str2mask(const char *str) break; } } - kfree(s); + kfree(buf); return mask; } diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index ae201b3eda89..5011f4d91e37 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -6,14 +6,16 @@ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> */ -#include <linux/dcache.h> +#include <linux/tracepoint.h> +#include 
<linux/seq_file.h> #include <linux/debugfs.h> +#include <linux/dcache.h> #include <linux/fs.h> -#include <linux/seq_file.h> + #include <trace/kmemtrace.h> -#include "trace.h" #include "trace_output.h" +#include "trace.h" /* Select an alternative, minimalistic output than the original one */ #define TRACE_KMEM_OPT_MINIMAL 0x1 @@ -25,14 +27,156 @@ static struct tracer_opt kmem_opts[] = { }; static struct tracer_flags kmem_tracer_flags = { - .val = 0, - .opts = kmem_opts + .val = 0, + .opts = kmem_opts }; - -static bool kmem_tracing_enabled __read_mostly; static struct trace_array *kmemtrace_array; +/* Trace allocations */ +static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + struct trace_array *tr = kmemtrace_array; + struct kmemtrace_alloc_entry *entry; + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_ALLOC; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + entry->bytes_req = bytes_req; + entry->bytes_alloc = bytes_alloc; + entry->gfp_flags = gfp_flags; + entry->node = node; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static inline void kmemtrace_free(enum kmemtrace_type_id type_id, + unsigned long call_site, + const void *ptr) +{ + struct trace_array *tr = kmemtrace_array; + struct kmemtrace_free_entry *entry; + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry)); + if (!event) + return; + entry = ring_buffer_event_data(event); + tracing_generic_entry_update(&entry->ent, 0, 0); + + entry->ent.type = TRACE_KMEM_FREE; + entry->type_id = type_id; + entry->call_site = call_site; + entry->ptr = ptr; + + ring_buffer_unlock_commit(tr->buffer, event); + + trace_wake_up(); +} + +static void kmemtrace_kmalloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmem_cache_alloc(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, -1); +} + +static void kmemtrace_kmalloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node) +{ + kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr, + bytes_req, bytes_alloc, gfp_flags, node); +} + +static void kmemtrace_kfree(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); +} + +static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) +{ + kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); +} + +static int kmemtrace_start_probes(void) +{ + int err; + + err = register_trace_kmalloc(kmemtrace_kmalloc); + if (err) + return err; + err = 
register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + if (err) + return err; + err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); + if (err) + return err; + err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + if (err) + return err; + err = register_trace_kfree(kmemtrace_kfree); + if (err) + return err; + err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + + return err; +} + +static void kmemtrace_stop_probes(void) +{ + unregister_trace_kmalloc(kmemtrace_kmalloc); + unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); + unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + unregister_trace_kfree(kmemtrace_kfree); + unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); +} + static int kmem_trace_init(struct trace_array *tr) { int cpu; @@ -41,14 +185,14 @@ static int kmem_trace_init(struct trace_array *tr) for_each_cpu_mask(cpu, cpu_possible_map) tracing_reset(tr, cpu); - kmem_tracing_enabled = true; + kmemtrace_start_probes(); return 0; } static void kmem_trace_reset(struct trace_array *tr) { - kmem_tracing_enabled = false; + kmemtrace_stop_probes(); } static void kmemtrace_headers(struct seq_file *s) @@ -66,47 +210,84 @@ static void kmemtrace_headers(struct seq_file *s) } /* - * The two following functions give the original output from kmemtrace, - * or something close to....perhaps they need some missing things + * The following functions give the original output from kmemtrace, + * plus the origin CPU, since reordering occurs in-kernel now. */ + +#define KMEMTRACE_USER_ALLOC 0 +#define KMEMTRACE_USER_FREE 1 + +struct kmemtrace_user_event { + u8 event_id; + u8 type_id; + u16 event_size; + u32 cpu; + u64 timestamp; + unsigned long call_site; + unsigned long ptr; +}; + +struct kmemtrace_user_event_alloc { + size_t bytes_req; + size_t bytes_alloc; + unsigned gfp_flags; + int node; +}; + static enum print_line_t -kmemtrace_print_alloc_original(struct trace_iterator *iter, - struct kmemtrace_alloc_entry *entry) +kmemtrace_print_alloc_user(struct trace_iterator *iter, + struct kmemtrace_alloc_entry *entry) { + struct kmemtrace_user_event_alloc *ev_alloc; struct trace_seq *s = &iter->seq; - int ret; + struct kmemtrace_user_event *ev; + + ev = trace_seq_reserve(s, sizeof(*ev)); + if (!ev) + return TRACE_TYPE_PARTIAL_LINE; - /* Taken from the old linux/kmemtrace.h */ - ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " - "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", - entry->type_id, entry->call_site, (unsigned long) entry->ptr, - (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, - (unsigned long) entry->gfp_flags, entry->node); + ev->event_id = KMEMTRACE_USER_ALLOC; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev) + sizeof(*ev_alloc); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long)entry->ptr; - if (!ret) + ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc)); + if (!ev_alloc) return TRACE_TYPE_PARTIAL_LINE; + ev_alloc->bytes_req = entry->bytes_req; + ev_alloc->bytes_alloc = entry->bytes_alloc; + ev_alloc->gfp_flags = entry->gfp_flags; + ev_alloc->node = entry->node; + return TRACE_TYPE_HANDLED; } static enum print_line_t -kmemtrace_print_free_original(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) +kmemtrace_print_free_user(struct trace_iterator *iter, + struct kmemtrace_free_entry *entry) { struct trace_seq 
*s = &iter->seq; - int ret; + struct kmemtrace_user_event *ev; - /* Taken from the old linux/kmemtrace.h */ - ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", - entry->type_id, entry->call_site, (unsigned long) entry->ptr); - - if (!ret) + ev = trace_seq_reserve(s, sizeof(*ev)); + if (!ev) return TRACE_TYPE_PARTIAL_LINE; + ev->event_id = KMEMTRACE_USER_FREE; + ev->type_id = entry->type_id; + ev->event_size = sizeof(*ev); + ev->cpu = iter->cpu; + ev->timestamp = iter->ts; + ev->call_site = entry->call_site; + ev->ptr = (unsigned long)entry->ptr; + return TRACE_TYPE_HANDLED; } - /* The two other following provide a more minimalistic output */ static enum print_line_t kmemtrace_print_alloc_compress(struct trace_iterator *iter, @@ -178,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter, static enum print_line_t kmemtrace_print_free_compress(struct trace_iterator *iter, - struct kmemtrace_free_entry *entry) + struct kmemtrace_free_entry *entry) { struct trace_seq *s = &iter->seq; int ret; @@ -239,20 +420,22 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) switch (entry->type) { case TRACE_KMEM_ALLOC: { struct kmemtrace_alloc_entry *field; + trace_assign_type(field, entry); if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_alloc_compress(iter, field); else - return kmemtrace_print_alloc_original(iter, field); + return kmemtrace_print_alloc_user(iter, field); } case TRACE_KMEM_FREE: { struct kmemtrace_free_entry *field; + trace_assign_type(field, entry); if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) return kmemtrace_print_free_compress(iter, field); else - return kmemtrace_print_free_original(iter, field); + return kmemtrace_print_free_user(iter, field); } default: @@ -260,70 +443,13 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) } } -/* Trace allocations */ -void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node) -{ - struct ring_buffer_event *event; - struct kmemtrace_alloc_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - - entry->call_site = call_site; - entry->ptr = ptr; - entry->bytes_req = bytes_req; - entry->bytes_alloc = bytes_alloc; - entry->gfp_flags = gfp_flags; - entry->node = node; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_alloc_node); - -void kmemtrace_mark_free(enum kmemtrace_type_id type_id, - unsigned long call_site, - const void *ptr) -{ - struct ring_buffer_event *event; - struct kmemtrace_free_entry *entry; - struct trace_array *tr = kmemtrace_array; - - if (!kmem_tracing_enabled) - return; - - event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE, - sizeof(*entry), 0, 0); - if (!event) - return; - entry = ring_buffer_event_data(event); - entry->type_id = type_id; - entry->call_site = call_site; - entry->ptr = ptr; - - trace_buffer_unlock_commit(tr, event, 0, 0); -} -EXPORT_SYMBOL(kmemtrace_mark_free); - static struct tracer kmem_tracer __read_mostly = { - .name = "kmemtrace", - .init = kmem_trace_init, - .reset = kmem_trace_reset, - .print_line = kmemtrace_print_line, - .print_header = kmemtrace_headers, - .flags = &kmem_tracer_flags + .name = "kmemtrace", + .init = kmem_trace_init, + .reset 
= kmem_trace_reset, + .print_line = kmemtrace_print_line, + .print_header = kmemtrace_headers, + .flags = &kmem_tracer_flags }; void kmemtrace_init(void) @@ -335,5 +461,4 @@ static int __init init_kmem_tracer(void) { return register_tracer(&kmem_tracer); } - device_initcall(init_kmem_tracer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a0174a40c563..1ce5dc6372b8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -30,6 +30,7 @@ #include <linux/percpu.h> #include <linux/splice.h> #include <linux/kdebug.h> +#include <linux/string.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> @@ -147,8 +148,7 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); -long -ns2usecs(cycle_t nsec) +unsigned long long ns2usecs(cycle_t nsec) { nsec += 500; do_div(nsec, 1000); @@ -1632,7 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter) return; cpumask_set_cpu(iter->cpu, iter->started); - trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); + + /* Don't print started cpu buffer for the first entry of the trace */ + if (iter->idx > 1) + trace_seq_printf(s, "##### CPU %u buffer started ####\n", + iter->cpu); } static enum print_line_t print_trace_fmt(struct trace_iterator *iter) @@ -1867,6 +1871,11 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + goto fail; + + cpumask_clear(iter->started); + if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -1917,6 +1926,7 @@ __tracing_open(struct inode *inode, struct file *file) if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); } + free_cpumask_var(iter->started); fail: mutex_unlock(&trace_types_lock); kfree(iter->trace); @@ -1960,6 +1970,7 @@ static int tracing_release(struct inode *inode, struct file *file) seq_release(inode, file); mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); kfree(iter->trace); kfree(iter); return 0; @@ -2358,9 +2369,9 @@ static const char readme_msg[] = "# mkdir /debug\n" "# mount -t debugfs nodev /debug\n\n" "# cat /debug/tracing/available_tracers\n" - "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" + "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" "# cat /debug/tracing/current_tracer\n" - "none\n" + "nop\n" "# echo sched_switch > /debug/tracing/current_tracer\n" "# cat /debug/tracing/current_tracer\n" "sched_switch\n" @@ -3266,19 +3277,13 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) info->tr = &global_trace; info->cpu = cpu; - info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; - if (!info->spare) - goto out; filp->private_data = info; - return 0; - - out: - kfree(info); - return -ENOMEM; + return nonseekable_open(inode, filp); } static ssize_t @@ -3293,6 +3298,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (!count) return 0; + if (!info->spare) + info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + if (!info->spare) + return -ENOMEM; + /* Do we have previous read data to read? 
*/ if (info->read < PAGE_SIZE) goto read; @@ -3331,7 +3341,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; - ring_buffer_free_read_page(info->tr->buffer, info->spare); + if (info->spare) + ring_buffer_free_read_page(info->tr->buffer, info->spare); kfree(info); return 0; @@ -3417,14 +3428,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, int size, i; size_t ret; - /* - * We can't seek on a buffer input - */ - if (unlikely(*ppos)) - return -ESPIPE; + if (*ppos & (PAGE_SIZE - 1)) { + WARN_ONCE(1, "Ftrace: previous read must page-align\n"); + return -EINVAL; + } + if (len & (PAGE_SIZE - 1)) { + WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); + if (len < PAGE_SIZE) + return -EINVAL; + len &= PAGE_MASK; + } - for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) { + for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3463,6 +3479,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.partial[i].offset = 0; spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; + *ppos += PAGE_SIZE; } spd.nr_pages = i; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index cb0ce3fc36d3..e685ac2b2ba1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -182,6 +182,12 @@ struct trace_power { struct power_trace state_data; }; +enum kmemtrace_type_id { + KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */ + KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */ + KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */ +}; + struct kmemtrace_alloc_entry { struct trace_entry ent; enum kmemtrace_type_id type_id; @@ -596,7 +602,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace, #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); -extern long ns2usecs(cycle_t nsec); +extern unsigned long long ns2usecs(cycle_t nsec); extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64ec4d278ffb..576f4fa2af0d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -503,6 +503,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = '\0'; pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) @@ -520,9 +521,10 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } - if (filter_add_pred(call, pred)) { + err = filter_add_pred(call, pred); + if (err < 0) { filter_free_pred(pred); - return -EINVAL; + return err; } *ppos += cnt; @@ -569,6 +571,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = '\0'; pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) @@ -586,10 +589,11 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } - if (filter_add_subsystem_pred(system, pred)) { + err = filter_add_subsystem_pred(system, pred); + if (err < 0) { filter_free_subsystem_preds(system); filter_free_pred(pred); - return -EINVAL; + return err; } *ppos += cnt; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 026be412f356..e03cbf1e38f3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -215,7 +215,7 @@ static int __filter_add_pred(struct 
ftrace_event_call *call, } } - return -ENOMEM; + return -ENOSPC; } static int is_string_field(const char *type) @@ -319,7 +319,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, } if (i == MAX_FILTER_PRED) - return -EINVAL; + return -ENOSPC; events_for_each(call) { int err; @@ -410,16 +410,22 @@ int filter_parse(char **pbuf, struct filter_pred *pred) } } + if (!val_str) { + pred->field_name = NULL; + return -EINVAL; + } + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!pred->field_name) return -ENOMEM; - pred->val = simple_strtoull(val_str, &tmp, 10); + pred->val = simple_strtoull(val_str, &tmp, 0); if (tmp == val_str) { pred->str_val = kstrdup(val_str, GFP_KERNEL); if (!pred->str_val) return -ENOMEM; - } + } else if (*tmp != '\0') + return -EINVAL; return 0; } diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 30743f7d4110..d363c6672c6c 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -105,10 +105,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ return 0; #undef __entry -#define __entry "REC" +#define __entry REC #undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) #undef TP_fast_assign #define TP_fast_assign(args...) args diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 4d9952d3df50..07a22c33ebf3 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -40,7 +40,7 @@ #undef TRACE_FIELD_ZERO_CHAR #define TRACE_FIELD_ZERO_CHAR(item) \ - ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ + ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \ "offset:%u;\tsize:0;\n", \ (unsigned int)offsetof(typeof(field), item)); \ if (!ret) \ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index d72b9a63b247..64b54a59c55b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -423,7 +423,7 @@ int trace_print_lat_context(struct trace_iterator *iter) trace_find_cmdline(entry->pid, comm); - ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]" " %ld.%03ldms (+%ld.%03ldms): ", comm, entry->pid, iter->cpu, entry->flags, entry->preempt_count, iter->idx, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index de35f200abd3..9117cea6f1ae 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -62,6 +62,9 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) pc = preempt_count(); tracing_record_cmdline(current); + if (sched_stopped) + return; + local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 3c5ad6b2ec84..5bc00e8f153e 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -154,7 +154,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); /* @@ -257,6 +257,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success) data = wakeup_trace->data[wakeup_cpu]; 
data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + + /* + * We must be careful in using CALLER_ADDR2. But since wake_up + * is not called by an assembly function (where as schedule is) + * it should be safe to use it here. + */ trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index a2a3af29c943..5e579645ac86 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -1,5 +1,5 @@ +#include <trace/syscall.h> #include <linux/kernel.h> -#include <linux/ftrace.h> #include <asm/syscall.h> #include "trace_output.h" diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b6b966ce1451..f71fb2a08950 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -966,20 +966,20 @@ undo: } #ifdef CONFIG_SMP -static struct workqueue_struct *work_on_cpu_wq __read_mostly; struct work_for_cpu { - struct work_struct work; + struct completion completion; long (*fn)(void *); void *arg; long ret; }; -static void do_work_for_cpu(struct work_struct *w) +static int do_work_for_cpu(void *_wfc) { - struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); - + struct work_for_cpu *wfc = _wfc; wfc->ret = wfc->fn(wfc->arg); + complete(&wfc->completion); + return 0; } /** @@ -990,17 +990,23 @@ static void do_work_for_cpu(struct work_struct *w) * * This will return the value @fn returns. * It is up to the caller to ensure that the cpu doesn't go offline. + * The caller must not hold any locks which would prevent @fn from completing. */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { - struct work_for_cpu wfc; - - INIT_WORK(&wfc.work, do_work_for_cpu); - wfc.fn = fn; - wfc.arg = arg; - queue_work_on(cpu, work_on_cpu_wq, &wfc.work); - flush_work(&wfc.work); - + struct task_struct *sub_thread; + struct work_for_cpu wfc = { + .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), + .fn = fn, + .arg = arg, + }; + + sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); + if (IS_ERR(sub_thread)) + return PTR_ERR(sub_thread); + kthread_bind(sub_thread, cpu); + wake_up_process(sub_thread); + wait_for_completion(&wfc.completion); return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); @@ -1016,8 +1022,4 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); -#ifdef CONFIG_SMP - work_on_cpu_wq = create_workqueue("work_on_cpu"); - BUG_ON(!work_on_cpu_wq); -#endif } |