diff options
Diffstat (limited to 'kernel')
66 files changed, 3381 insertions, 1488 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index eca595e2fd52..2da48d3515eb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ - notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ + notifier.o ksysfs.o sched_clock.o cred.o \ async.o range.o obj-y += groups.o diff --git a/kernel/async.c b/kernel/async.c index d5fe7af0de2e..4c2843c0043e 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -120,7 +120,7 @@ static void async_run_entry_fn(struct work_struct *work) struct async_entry *entry = container_of(work, struct async_entry, work); unsigned long flags; - ktime_t calltime, delta, rettime; + ktime_t uninitialized_var(calltime), delta, rettime; /* 1) move self to the running queue */ spin_lock_irqsave(&async_lock, flags); @@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain); void async_synchronize_cookie_domain(async_cookie_t cookie, struct list_head *running) { - ktime_t starttime, delta, endtime; + ktime_t uninitialized_var(starttime), delta, endtime; if (initcall_debug && system_state == SYSTEM_BOOTING) { printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1d2b6ceea95d..453100a4159d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list) /* the list of cgroups eligible for automatic release. 
Protected by * release_list_lock */ static LIST_HEAD(release_list); -static DEFINE_SPINLOCK(release_list_lock); +static DEFINE_RAW_SPINLOCK(release_list_lock); static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); @@ -4014,11 +4014,11 @@ again: finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); cgroup_lock_hierarchy(cgrp->root); /* delete this cgroup from parent->children */ @@ -4671,13 +4671,13 @@ static void check_for_release(struct cgroup *cgrp) * already queued for a userspace notification, queue * it now */ int need_schedule_work = 0; - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); if (!cgroup_is_removed(cgrp) && list_empty(&cgrp->release_list)) { list_add(&cgrp->release_list, &release_list); need_schedule_work = 1; } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); if (need_schedule_work) schedule_work(&release_agent_work); } @@ -4729,7 +4729,7 @@ static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); mutex_lock(&cgroup_mutex); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); while (!list_empty(&release_list)) { char *argv[3], *envp[3]; int i; @@ -4738,7 +4738,7 @@ static void cgroup_release_agent(struct work_struct *work) struct cgroup, release_list); list_del_init(&cgrp->release_list); - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!pathbuf) goto continue_free; @@ -4768,9 +4768,9 @@ static void cgroup_release_agent(struct work_struct *work) continue_free: kfree(pathbuf); 
kfree(agentbuf); - spin_lock(&release_list_lock); + raw_spin_lock(&release_list_lock); } - spin_unlock(&release_list_lock); + raw_spin_unlock(&release_list_lock); mutex_unlock(&cgroup_mutex); } diff --git a/kernel/cred.c b/kernel/cred.c index 8ef31f53c44c..bb55d052d858 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -644,6 +644,9 @@ void __init cred_init(void) */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif const struct cred *old; struct cred *new; @@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (!new) return NULL; +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } +#endif + kdebug("prepare_kernel_cred() alloc %p", new); if (daemon) @@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) get_group_info(new->group_info); #ifdef CONFIG_KEYS - atomic_inc(&init_tgcred.usage); - new->tgcred = &init_tgcred; + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = NULL; + new->tgcred = tgcred; new->request_key_auth = NULL; new->thread_keyring = NULL; new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; diff --git a/kernel/events/core.c b/kernel/events/core.c index 0f857782d06f..d1a1bee35228 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -29,6 +29,7 @@ #include <linux/hardirq.h> #include <linux/rculist.h> #include <linux/uaccess.h> +#include <linux/suspend.h> #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> @@ -5758,6 +5759,7 @@ struct pmu *perf_init_event(struct perf_event *event) pmu = idr_find(&pmu_idr, event->attr.type); rcu_read_unlock(); if (pmu) { + event->pmu = pmu; ret = pmu->event_init(event); if (ret) pmu = ERR_PTR(ret); @@ -5765,6 +5767,7 @@ struct pmu *perf_init_event(struct perf_event *event) } 
list_for_each_entry_rcu(pmu, &pmus, entry) { + event->pmu = pmu; ret = pmu->event_init(event); if (!ret) goto unlock; @@ -5891,8 +5894,6 @@ done: return ERR_PTR(err); } - event->pmu = pmu; - if (!event->parent) { if (event->attach_state & PERF_ATTACH_TASK) jump_label_inc(&perf_sched_events); @@ -6852,7 +6853,7 @@ static void __cpuinit perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -6941,7 +6942,14 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { unsigned int cpu = (long)hcpu; - switch (action & ~CPU_TASKS_FROZEN) { + /* + * Ignore suspend/resume action, the perf_pm_notifier will + * take care of that. + */ + if (action & CPU_TASKS_FROZEN) + return NOTIFY_OK; + + switch (action) { case CPU_UP_PREPARE: case CPU_DOWN_FAILED: @@ -6960,6 +6968,90 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } +static void perf_pm_resume_cpu(void *unused) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + ctx = cpuctx->task_ctx; + + perf_ctx_lock(cpuctx, ctx); + perf_pmu_disable(cpuctx->ctx.pmu); + + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + if (ctx) + ctx_sched_out(ctx, cpuctx, EVENT_ALL); + + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, ctx); + } + srcu_read_unlock(&pmus_srcu, idx); +} + +static void perf_pm_suspend_cpu(void *unused) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = 
this_cpu_ptr(pmu->pmu_cpu_context); + ctx = cpuctx->task_ctx; + + perf_ctx_lock(cpuctx, ctx); + perf_pmu_disable(cpuctx->ctx.pmu); + + perf_event_sched_in(cpuctx, ctx, current); + + perf_pmu_enable(cpuctx->ctx.pmu); + perf_ctx_unlock(cpuctx, ctx); + } + srcu_read_unlock(&pmus_srcu, idx); +} + +static int perf_resume(void) +{ + get_online_cpus(); + smp_call_function(perf_pm_resume_cpu, NULL, 1); + put_online_cpus(); + + return NOTIFY_OK; +} + +static int perf_suspend(void) +{ + get_online_cpus(); + smp_call_function(perf_pm_suspend_cpu, NULL, 1); + put_online_cpus(); + + return NOTIFY_OK; +} + +static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr) +{ + switch (action) { + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + return perf_resume(); + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + return perf_suspend(); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block perf_pm_notifier = { + .notifier_call = perf_pm, +}; + void __init perf_event_init(void) { int ret; @@ -6974,6 +7066,7 @@ void __init perf_event_init(void) perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); register_reboot_notifier(&perf_reboot_notifier); + register_pm_notifier(&perf_pm_notifier); ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); diff --git a/kernel/freezer.c b/kernel/freezer.c index 7b01de98bb6a..66a594e8ad2f 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p) unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, 0); + signal_wake_up(p, 1); spin_unlock_irqrestore(&p->sighand->siglock, flags); } diff --git a/kernel/futex.c b/kernel/futex.c index 11cbe052b2e8..1511dff0cfd6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -854,7 +854,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) { struct task_struct *new_owner; struct futex_pi_state *pi_state 
= this->pi_state; - u32 curval, newval; + u32 uninitialized_var(curval), newval; if (!pi_state) return -EINVAL; @@ -916,7 +916,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) static int unlock_futex_pi(u32 __user *uaddr, u32 uval) { - u32 oldval; + u32 uninitialized_var(oldval); /* * There is no waiter, so we unlock the futex. The owner died @@ -1576,7 +1576,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; struct task_struct *oldowner = pi_state->owner; - u32 uval, curval, newval; + u32 uval, uninitialized_var(curval), newval; int ret; /* Owner died? */ @@ -1793,7 +1793,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * * Returns: * 0 - uaddr contains val and hb has been locked - * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) @@ -2481,7 +2481,7 @@ err_unlock: */ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval, mval; + u32 uval, uninitialized_var(nval), mval; retry: if (get_user(uval, uaddr)) diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index dc5114b4c16c..f7c543a801d9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -26,7 +26,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; @@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip); int irq_set_irq_type(unsigned int irq, unsigned int type) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = 
irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) @@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type); int irq_set_handler_data(unsigned int irq, void *data) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; @@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data); int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; @@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) int irq_set_chip_data(unsigned int irq, void *data) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; @@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc) } } +void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu) +{ + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); + cpumask_set_cpu(cpu, desc->percpu_enabled); +} + +void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu) +{ + if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); + cpumask_clear_cpu(cpu, desc->percpu_enabled); +} + static inline void mask_ack_irq(struct irq_desc *desc) { if (desc->irq_data.chip->irq_mask_ack) @@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } +/** + * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Per CPU interrupts on SMP 
machines without locking requirements. Same as + * handle_percpu_irq() above but with the following extras: + * + * action->percpu_dev_id is a pointer to percpu variables which + * contain the real device id for the cpu on which this handler is + * called + */ +void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct irqaction *action = desc->action; + void *dev_id = __this_cpu_ptr(action->percpu_dev_id); + irqreturn_t res; + + kstat_incr_irqs_this_cpu(irq, desc); + + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, dev_id); + trace_irq_handler_exit(irq, action, res); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} + void __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, const char *name) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0); if (!desc) return; @@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 6546431447d7..a73dd6c7372d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc); extern void irq_shutdown(struct irq_desc *desc); extern void irq_enable(struct irq_desc *desc); extern void irq_disable(struct irq_desc *desc); +extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); +extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu); extern void mask_irq(struct irq_desc *desc); extern void unmask_irq(struct irq_desc *desc); @@ 
-114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); } +#define _IRQ_DESC_CHECK (1 << 0) +#define _IRQ_DESC_PERCPU (1 << 1) + +#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) +#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) + struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check); void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); static inline struct irq_desc * -irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check) { - return __irq_get_desc_lock(irq, flags, true); + return __irq_get_desc_lock(irq, flags, true, check); } static inline void @@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) } static inline struct irq_desc * -irq_get_desc_lock(unsigned int irq, unsigned long *flags) +irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check) { - return __irq_get_desc_lock(irq, flags, false); + return __irq_get_desc_lock(irq, flags, false, check); } static inline void diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 039b889ea053..1550e8447a16 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -424,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset) } struct irq_desc * -__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, + unsigned int check) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { + if (check & _IRQ_DESC_CHECK) { + if ((check & _IRQ_DESC_PERCPU) && + !irq_settings_is_per_cpu_devid(desc)) + return NULL; + + if (!(check & _IRQ_DESC_PERCPU) && + irq_settings_is_per_cpu_devid(desc)) + return NULL; + } + if (bus) 
chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, *flags); @@ -443,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) chip_bus_sync_unlock(desc); } +int irq_set_percpu_devid(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return -EINVAL; + + if (desc->percpu_enabled) + return -EINVAL; + + desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); + + if (!desc->percpu_enabled) + return -ENOMEM; + + irq_set_percpu_devid_flags(irq); + return 0; +} + /** * dynamic_irq_cleanup - cleanup a dynamically allocated irq * @irq: irq number to initialize diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 9b956fa20308..67ce837ae52c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; @@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) static int __disable_irq_nosync(unsigned int irq) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return -EINVAL; @@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) void enable_irq(unsigned int irq) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); if (!desc) return; @@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) struct irq_desc *desc = irq_to_desc(irq); int ret = -ENXIO; + if (irq_desc_get_chip(desc)->flags & 
IRQCHIP_SKIP_SET_WAKE) + return 0; + if (desc->irq_data.chip->irq_set_wake) ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); @@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on) int irq_set_irq_wake(unsigned int irq, unsigned int on) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); int ret = 0; if (!desc) @@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake); int can_request_irq(unsigned int irq, unsigned long irqflags) { unsigned long flags; - struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); int canrequest = 0; if (!desc) @@ -1118,6 +1121,8 @@ int setup_irq(unsigned int irq, struct irqaction *act) int retval; struct irq_desc *desc = irq_to_desc(irq); + if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) + return -EINVAL; chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); @@ -1126,7 +1131,7 @@ int setup_irq(unsigned int irq, struct irqaction *act) } EXPORT_SYMBOL_GPL(setup_irq); - /* +/* * Internal function to unregister an irqaction - used to free * regular and special interrupts that are part of the architecture. 
*/ @@ -1224,7 +1229,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) */ void remove_irq(unsigned int irq, struct irqaction *act) { - __free_irq(irq, act->dev_id); + struct irq_desc *desc = irq_to_desc(irq); + + if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) + __free_irq(irq, act->dev_id); } EXPORT_SYMBOL_GPL(remove_irq); @@ -1246,7 +1254,7 @@ void free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); - if (!desc) + if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) return; #ifdef CONFIG_SMP @@ -1324,7 +1332,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, if (!desc) return -EINVAL; - if (!irq_settings_can_request(desc)) + if (!irq_settings_can_request(desc) || + WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; if (!handler) { @@ -1409,3 +1418,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler, return !ret ? IRQC_IS_HARDIRQ : ret; } EXPORT_SYMBOL_GPL(request_any_context_irq); + +void enable_percpu_irq(unsigned int irq, unsigned int type) +{ + unsigned int cpu = smp_processor_id(); + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + + if (!desc) + return; + + type &= IRQ_TYPE_SENSE_MASK; + if (type != IRQ_TYPE_NONE) { + int ret; + + ret = __irq_set_trigger(desc, irq, type); + + if (ret) { + WARN(1, "failed to set type for IRQ%d\n", irq); + goto out; + } + } + + irq_percpu_enable(desc, cpu); +out: + irq_put_desc_unlock(desc, flags); +} + +void disable_percpu_irq(unsigned int irq) +{ + unsigned int cpu = smp_processor_id(); + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + + if (!desc) + return; + + irq_percpu_disable(desc, cpu); + irq_put_desc_unlock(desc, flags); +} + +/* + * Internal function to unregister a percpu irqaction. 
+ */ +static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + + WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); + + if (!desc) + return NULL; + + raw_spin_lock_irqsave(&desc->lock, flags); + + action = desc->action; + if (!action || action->percpu_dev_id != dev_id) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + goto bad; + } + + if (!cpumask_empty(desc->percpu_enabled)) { + WARN(1, "percpu IRQ %d still enabled on CPU%d!\n", + irq, cpumask_first(desc->percpu_enabled)); + goto bad; + } + + /* Found it - now remove it from the list of entries: */ + desc->action = NULL; + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + unregister_handler_proc(irq, action); + + module_put(desc->owner); + return action; + +bad: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return NULL; +} + +/** + * remove_percpu_irq - free a per-cpu interrupt + * @irq: Interrupt line to free + * @act: irqaction for the interrupt + * + * Used to remove interrupts statically setup by the early boot process. + */ +void remove_percpu_irq(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc && irq_settings_is_per_cpu_devid(desc)) + __free_percpu_irq(irq, act->percpu_dev_id); +} + +/** + * free_percpu_irq - free an interrupt allocated with request_percpu_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove a percpu interrupt handler. The handler is removed, but + * the interrupt line is not disabled. This must be done on each + * CPU before calling this function. The function does not return + * until any executing interrupts for this IRQ have completed. + * + * This function must not be called from interrupt context. 
+ */ +void free_percpu_irq(unsigned int irq, void __percpu *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !irq_settings_is_per_cpu_devid(desc)) + return; + + chip_bus_lock(desc); + kfree(__free_percpu_irq(irq, dev_id)); + chip_bus_sync_unlock(desc); +} + +/** + * setup_percpu_irq - setup a per-cpu interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt + * + * Used to statically setup per-cpu interrupts in the early boot process. + */ +int setup_percpu_irq(unsigned int irq, struct irqaction *act) +{ + struct irq_desc *desc = irq_to_desc(irq); + int retval; + + if (!desc || !irq_settings_is_per_cpu_devid(desc)) + return -EINVAL; + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; +} + +/** + * request_percpu_irq - allocate a percpu interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * @devname: An ascii name for the claiming device + * @dev_id: A percpu cookie passed back to the handler function + * + * This call allocates interrupt resources, but doesn't + * automatically enable the interrupt. It has to be done on each + * CPU using enable_percpu_irq(). + * + * Dev_id must be globally unique. It is a per-cpu variable, and + * the handler gets called with the interrupted CPU's instance of + * that variable. 
+ */ +int request_percpu_irq(unsigned int irq, irq_handler_t handler, + const char *devname, void __percpu *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + int retval; + + if (!dev_id) + return -EINVAL; + + desc = irq_to_desc(irq); + if (!desc || !irq_settings_can_request(desc) || + !irq_settings_is_per_cpu_devid(desc)) + return -EINVAL; + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->flags = IRQF_PERCPU; + action->name = devname; + action->percpu_dev_id = dev_id; + + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, action); + chip_bus_sync_unlock(desc); + + if (retval) + kfree(action); + + return retval; +} diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index f76fc00c9877..15e53b1766a6 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -9,6 +9,7 @@ #include <linux/irq.h> #include <linux/module.h> #include <linux/interrupt.h> +#include <linux/syscore_ops.h> #include "internals.h" @@ -39,25 +40,58 @@ void suspend_device_irqs(void) } EXPORT_SYMBOL_GPL(suspend_device_irqs); -/** - * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() - * - * Enable all interrupt lines previously disabled by suspend_device_irqs() that - * have the IRQS_SUSPENDED flag set. - */ -void resume_device_irqs(void) +static void resume_irqs(bool want_early) { struct irq_desc *desc; int irq; for_each_irq_desc(irq, desc) { unsigned long flags; + bool is_early = desc->action && + desc->action->flags & IRQF_EARLY_RESUME; + + if (is_early != want_early) + continue; raw_spin_lock_irqsave(&desc->lock, flags); __enable_irq(desc, irq, true); raw_spin_unlock_irqrestore(&desc->lock, flags); } } + +/** + * irq_pm_syscore_ops - enable interrupt lines early + * + * Enable all interrupt lines with %IRQF_EARLY_RESUME set. 
+ */ +static void irq_pm_syscore_resume(void) +{ + resume_irqs(true); +} + +static struct syscore_ops irq_pm_syscore_ops = { + .resume = irq_pm_syscore_resume, +}; + +static int __init irq_pm_init_ops(void) +{ + register_syscore_ops(&irq_pm_syscore_ops); + return 0; +} + +device_initcall(irq_pm_init_ops); + +/** + * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() + * + * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously + * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag + * set as well as those with %IRQF_FORCE_RESUME. + */ +void resume_device_irqs(void) +{ + resume_irqs(false); +} EXPORT_SYMBOL_GPL(resume_device_irqs); /** diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h index f1667833d444..1162f1030f18 100644 --- a/kernel/irq/settings.h +++ b/kernel/irq/settings.h @@ -13,6 +13,7 @@ enum { _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, _IRQ_NO_BALANCING = IRQ_NO_BALANCING, _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, + _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID, _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, }; @@ -24,6 +25,7 @@ enum { #define IRQ_NOTHREAD GOT_YOU_MORON #define IRQ_NOAUTOEN GOT_YOU_MORON #define IRQ_NESTED_THREAD GOT_YOU_MORON +#define IRQ_PER_CPU_DEVID GOT_YOU_MORON #undef IRQF_MODIFY_MASK #define IRQF_MODIFY_MASK GOT_YOU_MORON @@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) return desc->status_use_accessors & _IRQ_PER_CPU; } +static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_PER_CPU_DEVID; +} + static inline void irq_settings_set_per_cpu(struct irq_desc *desc) { desc->status_use_accessors |= _IRQ_PER_CPU; diff --git a/kernel/kmod.c b/kernel/kmod.c index ddc7644c1305..a4bea97c75b6 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...) 
atomic_inc(&kmod_concurrent); if (atomic_read(&kmod_concurrent) > max_modprobes) { /* We may be blaming an innocent here, but unlikely */ - if (kmod_loop_msg++ < 5) + if (kmod_loop_msg < 5) { printk(KERN_ERR "request_module: runaway loop modprobe %s\n", module_name); + kmod_loop_msg++; + } atomic_dec(&kmod_concurrent); return -ENOMEM; } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b30fd54eb985..2f193d0ba7f2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -78,10 +78,10 @@ static bool kprobes_all_disarmed; static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; static struct { - spinlock_t lock ____cacheline_aligned_in_smp; + raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; -static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); } @@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - spin_lock(&rp->lock); + raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); - spin_unlock(&rp->lock); + raw_spin_unlock(&rp->lock); } else /* Unregistering */ hlist_add_head(&ri->hlist, head); @@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, __acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spin_lock_irqsave(hlist_lock, *flags); } static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) __acquires(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_lock_irqsave(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = 
kretprobe_table_lock_ptr(hash); + raw_spin_lock_irqsave(hlist_lock, *flags); } void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, @@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, __releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - spinlock_t *hlist_lock; + raw_spinlock_t *hlist_lock; hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } static void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) __releases(hlist_lock) { - spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - spin_unlock_irqrestore(hlist_lock, *flags); + raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + raw_spin_unlock_irqrestore(hlist_lock, *flags); } /* @@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, /*TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); hlist_del(&ri->hlist); - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); ri->rp = rp; ri->task = current; @@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, kretprobe_table_unlock(hash, &flags); } else { rp->nmissed++; - spin_unlock_irqrestore(&rp->lock, flags); + raw_spin_unlock_irqrestore(&rp->lock, flags); } return 0; } @@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) rp->maxactive = num_possible_cpus(); #endif } - spin_lock_init(&rp->lock); + raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); for (i = 0; i < rp->maxactive; i++) { inst = kmalloc(sizeof(struct kretprobe_instance) + @@ -1959,7 +1959,7 @@ static int __init 
init_kprobes(void) for (i = 0; i < KPROBE_TABLE_SIZE; i++) { INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - spin_lock_init(&(kretprobe_table_locks[i].lock)); + raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); } /* diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 376066e10413..4ac8ebfcab59 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -58,7 +58,7 @@ #include <linux/list.h> #include <linux/stacktrace.h> -static DEFINE_SPINLOCK(latency_lock); +static DEFINE_RAW_SPINLOCK(latency_lock); #define MAXLR 128 static struct latency_record latency_record[MAXLR]; @@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p) if (!latencytop_enabled) return; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&p->latency_record, 0, sizeof(p->latency_record)); p->latency_record_count = 0; - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void clear_global_latency_tracing(void) { unsigned long flags; - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); memset(&latency_record, 0, sizeof(latency_record)); - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static void __sched @@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) lat.max = usecs; store_stacktrace(tsk, &lat); - spin_lock_irqsave(&latency_lock, flags); + raw_spin_lock_irqsave(&latency_lock, flags); account_global_scheduler_latency(tsk, &lat); @@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); out_unlock: - spin_unlock_irqrestore(&latency_lock, flags); + raw_spin_unlock_irqrestore(&latency_lock, flags); } static int lstats_show(struct seq_file *m, void *v) diff --git a/kernel/lockdep.c b/kernel/lockdep.c 
index 91d67ce3a8d5..e69434b070da 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -96,8 +96,13 @@ static int graph_lock(void) static inline int graph_unlock(void) { - if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) { + /* + * The lockdep graph lock isn't locked while we expect it to + * be, we're confused now, bye! + */ return DEBUG_LOCKS_WARN_ON(1); + } current->lockdep_recursion--; arch_spin_unlock(&lockdep_lock); @@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; static inline struct lock_class *hlock_class(struct held_lock *hlock) { if (!hlock->class_idx) { + /* + * Someone passed in garbage, we give up. + */ DEBUG_LOCKS_WARN_ON(1); return NULL; } @@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) */ list_for_each_entry(class, hash_head, hash_entry) { if (class->key == key) { + /* + * Huh! same key, different name? Did someone trample + * on some memory? We're most confused. + */ WARN_ON_ONCE(class->name != lock->name); return class; } @@ -800,6 +812,10 @@ out_unlock_set: else if (subclass < NR_LOCKDEP_CACHING_CLASSES) lock->class_cache[subclass] = class; + /* + * Hash collision, did we smoke some? We found a class with a matching + * hash but the subclass -- which is hashed in -- didn't match. 
+ */ if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) return NULL; @@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock, unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ lock->parent = parent; lock->class->dep_gen_id = lockdep_dependency_gen_id; } @@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock) unsigned long nr; nr = lock - list_entries; - WARN_ON(nr >= nr_list_entries); + WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */ return lock->class->dep_gen_id == lockdep_dependency_gen_id; } @@ -1129,10 +1145,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, if (debug_locks_silent) return 0; - printk("\n=======================================================\n"); - printk( "[ INFO: possible circular locking dependency detected ]\n"); + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: possible circular locking dependency detected ]\n"); print_kernel_version(); - printk( "-------------------------------------------------------\n"); + printk("-------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); @@ -1196,6 +1213,9 @@ static noinline int print_bfs_bug(int ret) if (!debug_locks_off_graph_unlock()) return 0; + /* + * Breadth-first-search failed, graph got corrupted? 
+ */ WARN(1, "lockdep bfs error:%d\n", ret); return 0; @@ -1463,11 +1483,12 @@ print_bad_irq_dependency(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n======================================================\n"); - printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + printk("\n"); + printk("======================================================\n"); + printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", irqclass, irqclass); print_kernel_version(); - printk( "------------------------------------------------------\n"); + printk("------------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, @@ -1692,10 +1713,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=============================================\n"); - printk( "[ INFO: possible recursive locking detected ]\n"); + printk("\n"); + printk("=============================================\n"); + printk("[ INFO: possible recursive locking detected ]\n"); print_kernel_version(); - printk( "---------------------------------------------\n"); + printk("---------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); @@ -1944,6 +1966,11 @@ out_bug: if (!debug_locks_off_graph_unlock()) return 0; + /* + * Clearly we all shouldn't be here, but since we made it we + * can reliable say we messed up our state. See the above two + * gotos for reasons why we could possibly end up here. 
+ */ WARN_ON(1); return 0; @@ -1975,6 +2002,11 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct held_lock *hlock_curr, *hlock_next; int i, j; + /* + * We might need to take the graph lock, ensure we've got IRQs + * disabled to make this an IRQ-safe lock.. for recursion reasons + * lockdep won't complain about its own locking errors. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; /* @@ -2126,6 +2158,10 @@ static void check_chain_key(struct task_struct *curr) hlock = curr->held_locks + i; if (chain_key != hlock->prev_chain_key) { debug_locks_off(); + /* + * We got mighty confused, our chain keys don't match + * with what we expect, someone trample on our task state? + */ WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, @@ -2133,6 +2169,9 @@ static void check_chain_key(struct task_struct *curr) return; } id = hlock->class_idx - 1; + /* + * Whoops ran out of static storage again? + */ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return; @@ -2144,6 +2183,10 @@ static void check_chain_key(struct task_struct *curr) } if (chain_key != curr->curr_chain_key) { debug_locks_off(); + /* + * More smoking hash instead of calculating it, damn see these + * numbers float.. I bet that a pink elephant stepped on my memory. 
+ */ WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", curr->lockdep_depth, i, (unsigned long long)chain_key, @@ -2177,10 +2220,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=================================\n"); - printk( "[ INFO: inconsistent lock state ]\n"); + printk("\n"); + printk("=================================\n"); + printk("[ INFO: inconsistent lock state ]\n"); print_kernel_version(); - printk( "---------------------------------\n"); + printk("---------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); @@ -2241,10 +2285,11 @@ print_irq_inversion_bug(struct task_struct *curr, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - printk("\n=========================================================\n"); - printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + printk("\n"); + printk("=========================================================\n"); + printk("[ INFO: possible irq lock inversion dependency detected ]\n"); print_kernel_version(); - printk( "---------------------------------------------------------\n"); + printk("---------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); @@ -2525,12 +2570,24 @@ void trace_hardirqs_on_caller(unsigned long ip) return; } + /* + * We're enabling irqs and according to our state above irqs weren't + * already enabled, yet we find the hardware thinks they are in fact + * enabled.. someone messed up their IRQ state tracing. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; + /* + * See the fine text that goes along with this variable definition. 
+ */ if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) return; + /* + * Can't allow enabling interrupts while in an interrupt handler, + * that's general bad form and such. Recursion, limited stack etc.. + */ if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) return; @@ -2558,6 +2615,10 @@ void trace_hardirqs_off_caller(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * So we're supposed to get called after you mask local IRQs, but for + * some reason the hardware doesn't quite think you did a proper job. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2590,6 +2651,10 @@ void trace_softirqs_on(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * We fancy IRQs being disabled here, see softirq.c, avoids + * funny state and nesting things. + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2626,6 +2691,9 @@ void trace_softirqs_off(unsigned long ip) if (unlikely(!debug_locks || current->lockdep_recursion)) return; + /* + * We fancy IRQs being disabled here, see softirq.c + */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return; @@ -2637,6 +2705,9 @@ void trace_softirqs_off(unsigned long ip) curr->softirq_disable_ip = ip; curr->softirq_disable_event = ++curr->irq_events; debug_atomic_inc(softirqs_off_events); + /* + * Whoops, we wanted softirqs off, so why aren't they? + */ DEBUG_LOCKS_WARN_ON(!softirq_count()); } else debug_atomic_inc(redundant_softirqs_off); @@ -2661,6 +2732,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (!(gfp_mask & __GFP_FS)) return; + /* + * Oi! Can't be having __GFP_FS allocations with IRQs disabled. 
+ */ if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; @@ -2773,13 +2847,13 @@ static int separate_irq_context(struct task_struct *curr, return 0; } -#else +#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ static inline int mark_lock_irq(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit) { - WARN_ON(1); + WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ return 1; } @@ -2799,7 +2873,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask) { } -#endif +#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ /* * Mark a lock with a usage bit, and validate the state transition: @@ -2880,6 +2954,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->cpu = raw_smp_processor_id(); #endif + /* + * Can't be having no nameless bastards around this place! + */ if (DEBUG_LOCKS_WARN_ON(!name)) { lock->name = "NULL"; return; @@ -2887,6 +2964,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, lock->name = name; + /* + * No key, no joy, we need to hash something. + */ if (DEBUG_LOCKS_WARN_ON(!key)) return; /* @@ -2894,6 +2974,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, */ if (!static_obj(key)) { printk("BUG: key %p not in .data!\n", key); + /* + * What it says above ^^^^^, I suggest you read it. + */ DEBUG_LOCKS_WARN_ON(1); return; } @@ -2932,6 +3015,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (unlikely(!debug_locks)) return 0; + /* + * Lockdep should run with IRQs disabled, otherwise we could + * get an interrupt which would want to take locks, which would + * end up in lockdep and have you got a head-ache already? 
+ */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -2963,6 +3051,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * dependency checks are done) */ depth = curr->lockdep_depth; + /* + * Ran out of static storage for our per-task lock stack again have we? + */ if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) return 0; @@ -2981,6 +3072,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, } hlock = curr->held_locks + depth; + /* + * Plain impossible, we just registered it and checked it weren't no + * NULL like.. I bet this mushroom I ate was good! + */ if (DEBUG_LOCKS_WARN_ON(!class)) return 0; hlock->class_idx = class_idx; @@ -3015,11 +3110,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * the hash, not class->key. */ id = class - lock_classes; + /* + * Whoops, we did it again.. ran straight out of our static allocation. + */ if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; if (!depth) { + /* + * How can we have a chain hash when we ain't got no keys?! + */ if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) return 0; chain_head = 1; @@ -3065,9 +3166,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n=====================================\n"); - printk( "[ BUG: bad unlock balance detected! ]\n"); - printk( "-------------------------------------\n"); + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: bad unlock balance detected! ]\n"); + printk("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3091,6 +3193,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock, { if (unlikely(!debug_locks)) return 0; + /* + * Lockdep should run with IRQs disabled, recursion, head-ache, etc.. 
+ */ if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return 0; @@ -3120,6 +3225,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) if (!class) return 0; + /* + * References, but not a lock we're actually ref-counting? + * State got messed up, follow the sites that change ->references + * and try to make sense of it. + */ if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) return 0; @@ -3142,6 +3252,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, int i; depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; @@ -3177,6 +3291,10 @@ found_it: return 0; } + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) return 0; return 1; @@ -3201,6 +3319,10 @@ lock_release_non_nested(struct task_struct *curr, * of held locks: */ depth = curr->lockdep_depth; + /* + * So we're all set to release this lock.. wait what lock? We don't + * own any locks, you've been drinking again? + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; @@ -3253,6 +3375,10 @@ found_it: return 0; } + /* + * We had N bottles of beer on the wall, we drank one, but now + * there's not N-1 bottles of beer left on the wall... + */ if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) return 0; return 1; @@ -3283,6 +3409,9 @@ static int lock_release_nested(struct task_struct *curr, return lock_release_non_nested(curr, lock, ip); curr->lockdep_depth--; + /* + * No more locks, but somehow we've got hash left over, who left it? 
+ */ if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) return 0; @@ -3365,10 +3494,13 @@ static void check_flags(unsigned long flags) * check if not in hardirq contexts: */ if (!hardirq_count()) { - if (softirq_count()) + if (softirq_count()) { + /* like the above, but with softirqs */ DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); - else + } else { + /* lick the above, does it taste good? */ DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } } if (!debug_locks) @@ -3478,9 +3610,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, if (debug_locks_silent) return 0; - printk("\n=================================\n"); - printk( "[ BUG: bad contention detected! ]\n"); - printk( "---------------------------------\n"); + printk("\n"); + printk("=================================\n"); + printk("[ BUG: bad contention detected! ]\n"); + printk("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3506,6 +3639,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) int i, contention_point, contending_point; depth = curr->lockdep_depth; + /* + * Whee, we contended on this lock, except it seems we're not + * actually trying to acquire anything much at all.. + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return; @@ -3555,6 +3692,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) int i, cpu; depth = curr->lockdep_depth; + /* + * Yay, we acquired ownership of this lock we didn't try to + * acquire, how the heck did that happen? + */ if (DEBUG_LOCKS_WARN_ON(!depth)) return; @@ -3759,8 +3900,12 @@ void lockdep_reset_lock(struct lockdep_map *lock) match |= class == lock->class_cache[j]; if (unlikely(match)) { - if (debug_locks_off_graph_unlock()) + if (debug_locks_off_graph_unlock()) { + /* + * We all just reset everything, how did it match? 
+ */ WARN_ON(1); + } goto out_restore; } } @@ -3839,9 +3984,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, if (debug_locks_silent) return; - printk("\n=========================\n"); - printk( "[ BUG: held lock freed! ]\n"); - printk( "-------------------------\n"); + printk("\n"); + printk("=========================\n"); + printk("[ BUG: held lock freed! ]\n"); + printk("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); @@ -3895,9 +4041,10 @@ static void print_held_locks_bug(struct task_struct *curr) if (debug_locks_silent) return; - printk("\n=====================================\n"); - printk( "[ BUG: lock held at task exit time! ]\n"); - printk( "-------------------------------------\n"); + printk("\n"); + printk("=====================================\n"); + printk("[ BUG: lock held at task exit time! ]\n"); + printk("-------------------------------------\n"); printk("%s/%d is exiting with locks still held!\n", curr->comm, task_pid_nr(curr)); lockdep_print_held_locks(curr); @@ -3991,16 +4138,17 @@ void lockdep_sys_exit(void) if (unlikely(curr->lockdep_depth)) { if (!debug_locks_off()) return; - printk("\n================================================\n"); - printk( "[ BUG: lock held when returning to user space! ]\n"); - printk( "------------------------------------------------\n"); + printk("\n"); + printk("================================================\n"); + printk("[ BUG: lock held when returning to user space! 
]\n"); + printk("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); } } -void lockdep_rcu_dereference(const char *file, const int line) +void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { struct task_struct *curr = current; @@ -4009,15 +4157,15 @@ void lockdep_rcu_dereference(const char *file, const int line) return; #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ - printk("\n===================================================\n"); - printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); - printk( "---------------------------------------------------\n"); - printk("%s:%d invoked rcu_dereference_check() without protection!\n", - file, line); + printk("\n"); + printk("===============================\n"); + printk("[ INFO: suspicious RCU usage. ]\n"); + printk("-------------------------------\n"); + printk("%s:%d %s!\n", file, line, s); printk("\nother info that might help us debug this:\n\n"); printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); } -EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); +EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); diff --git a/kernel/module.c b/kernel/module.c index 04379f92f843..93342d992f34 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3487,50 +3487,3 @@ void module_layout(struct module *mod, } EXPORT_SYMBOL(module_layout); #endif - -#ifdef CONFIG_TRACEPOINTS -void module_update_tracepoints(void) -{ - struct module *mod; - - mutex_lock(&module_mutex); - list_for_each_entry(mod, &modules, list) - if (!mod->taints) - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); - mutex_unlock(&module_mutex); -} - -/* - * Returns 0 if current not found. 
- * Returns 1 if current found. - */ -int module_get_iter_tracepoints(struct tracepoint_iter *iter) -{ - struct module *iter_mod; - int found = 0; - - mutex_lock(&module_mutex); - list_for_each_entry(iter_mod, &modules, list) { - if (!iter_mod->taints) { - /* - * Sorted module list - */ - if (iter_mod < iter->module) - continue; - else if (iter_mod > iter->module) - iter->tracepoint = NULL; - found = tracepoint_get_iter_range(&iter->tracepoint, - iter_mod->tracepoints_ptrs, - iter_mod->tracepoints_ptrs - + iter_mod->num_tracepoints); - if (found) { - iter->module = iter_mod; - break; - } - } - } - mutex_unlock(&module_mutex); - return found; -} -#endif diff --git a/kernel/params.c b/kernel/params.c index 22df3e0d142a..821788947e40 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param) } } -static inline char dash2underscore(char c) +static char dash2underscore(char c) { if (c == '-') return '_'; return c; } -static inline int parameq(const char *input, const char *paramname) +bool parameqn(const char *a, const char *b, size_t n) { - unsigned int i; - for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) - if (input[i] == '\0') - return 1; - return 0; + size_t i; + + for (i = 0; i < n; i++) { + if (dash2underscore(a[i]) != dash2underscore(b[i])) + return false; + } + return true; +} + +bool parameq(const char *a, const char *b) +{ + return parameqn(a, b, strlen(a)+1); } static int parse_one(char *param, diff --git a/kernel/pid.c b/kernel/pid.c index e432057f3b21..8cafe7e72ad2 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held()); + rcu_lockdep_assert(rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock()" + " protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/posix-cpu-timers.c 
b/kernel/posix-cpu-timers.c index c8008dd58ef2..e7cb76dc18f5 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) struct task_cputime sum; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); if (!cputimer->running) { - cputimer->running = 1; /* * The POSIX timer interface allows for absolute time expiry * values through the TIMER_ABSTIME flag, therefore we have @@ -284,10 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) * it. */ thread_group_cputime(tsk, &sum); + raw_spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; update_gt_cputime(&cputimer->cputime, &sum); - } + } else + raw_spin_lock_irqsave(&cputimer->lock, flags); *times = cputimer->cputime; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } /* @@ -998,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig) struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; - spin_lock_irqsave(&cputimer->lock, flags); + raw_spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; - spin_unlock_irqrestore(&cputimer->lock, flags); + raw_spin_unlock_irqrestore(&cputimer->lock, flags); } static u32 onecputick; @@ -1290,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) if (sig->cputimer.running) { struct task_cputime group_sample; - spin_lock(&sig->cputimer.lock); + raw_spin_lock(&sig->cputimer.lock); group_sample = sig->cputimer.cputime; - spin_unlock(&sig->cputimer.lock); + raw_spin_unlock(&sig->cputimer.lock); if (task_cputime_expired(&group_sample, &sig->cputime_expires)) return 1; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 3744c594b19b..cedd9982306a 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,6 +27,7 @@ config HIBERNATION select HIBERNATE_CALLBACKS select 
LZO_COMPRESS select LZO_DECOMPRESS + select CRC32 ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the @@ -65,6 +66,9 @@ config HIBERNATION For more information take a look at <file:Documentation/power/swsusp.txt>. +config ARCH_SAVE_PAGE_KEYS + bool + config PM_STD_PARTITION string "Default resume partition" depends on HIBERNATION diff --git a/kernel/power/Makefile b/kernel/power/Makefile index c5ebc6a90643..07e0e28ffba7 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -1,8 +1,8 @@ ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG -obj-$(CONFIG_PM) += main.o -obj-$(CONFIG_PM_SLEEP) += console.o +obj-$(CONFIG_PM) += main.o qos.o +obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o obj-$(CONFIG_FREEZER) += process.o obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o diff --git a/kernel/power/console.c b/kernel/power/console.c index 218e5af90156..b1dc456474b5 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -1,5 +1,5 @@ /* - * drivers/power/process.c - Functions for saving/restoring console. + * Functions for saving/restoring console. * * Originally from swsusp. 
*/ @@ -10,7 +10,6 @@ #include <linux/module.h> #include "power.h" -#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE) #define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) static int orig_fgconsole, orig_kmsg; @@ -32,4 +31,3 @@ void pm_restore_console(void) vt_kmsg_redirect(orig_kmsg); } } -#endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 8f7b1db1ece1..1c53f7fad5f7 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -14,6 +14,7 @@ #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> +#include <linux/async.h> #include <linux/kmod.h> #include <linux/delay.h> #include <linux/fs.h> @@ -29,12 +30,14 @@ #include "power.h" -static int nocompress = 0; -static int noresume = 0; +static int nocompress; +static int noresume; +static int resume_wait; +static int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; -int in_suspend __nosavedata = 0; +int in_suspend __nosavedata; enum { HIBERNATION_INVALID, @@ -334,13 +337,17 @@ int hibernation_snapshot(int platform_mode) if (error) goto Close; - error = dpm_prepare(PMSG_FREEZE); - if (error) - goto Complete_devices; - /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); if (error) + goto Close; + + error = freeze_kernel_threads(); + if (error) + goto Close; + + error = dpm_prepare(PMSG_FREEZE); + if (error) goto Complete_devices; suspend_console(); @@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode) * @platform_mode: If set, use platform driver to prepare for the transition. * * This routine must be called with pm_mutex held. If it is successful, control - * reappears in the restored target kernel in hibernation_snaphot(). + * reappears in the restored target kernel in hibernation_snapshot(). 
*/ int hibernation_restore(int platform_mode) { @@ -650,6 +657,9 @@ int hibernate(void) flags |= SF_PLATFORM_MODE; if (nocompress) flags |= SF_NOCOMPRESS_MODE; + else + flags |= SF_CRC32_MODE; + pr_debug("PM: writing image.\n"); error = swsusp_write(flags); swsusp_free(); @@ -724,6 +734,12 @@ static int software_resume(void) pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + if (resume_delay) { + printk(KERN_INFO "Waiting %dsec before reading resume device...\n", + resume_delay); + ssleep(resume_delay); + } + /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); if (!swsusp_resume_device) { @@ -732,6 +748,13 @@ static int software_resume(void) * to wait for this to finish. */ wait_for_device_probe(); + + if (resume_wait) { + while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0) + msleep(10); + async_synchronize_full(); + } + /* * We can't depend on SCSI devices being available after loading * one of their modules until scsi_complete_async_scans() is @@ -1060,7 +1083,21 @@ static int __init noresume_setup(char *str) return 1; } +static int __init resumewait_setup(char *str) +{ + resume_wait = 1; + return 1; +} + +static int __init resumedelay_setup(char *str) +{ + resume_delay = simple_strtoul(str, NULL, 0); + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); __setup("hibernate=", hibernate_setup); +__setup("resumewait", resumewait_setup); +__setup("resumedelay=", resumedelay_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index 6c601f871964..a52e88425a31 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -12,6 +12,8 @@ #include <linux/string.h> #include <linux/resume-trace.h> #include <linux/workqueue.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> #include "power.h" @@ -131,6 +133,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, 
power_attr(pm_test); #endif /* CONFIG_PM_DEBUG */ +#ifdef CONFIG_DEBUG_FS +static char *suspend_step_name(enum suspend_stat_step step) +{ + switch (step) { + case SUSPEND_FREEZE: + return "freeze"; + case SUSPEND_PREPARE: + return "prepare"; + case SUSPEND_SUSPEND: + return "suspend"; + case SUSPEND_SUSPEND_NOIRQ: + return "suspend_noirq"; + case SUSPEND_RESUME_NOIRQ: + return "resume_noirq"; + case SUSPEND_RESUME: + return "resume"; + default: + return ""; + } +} + +static int suspend_stats_show(struct seq_file *s, void *unused) +{ + int i, index, last_dev, last_errno, last_step; + + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; + last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1; + last_errno %= REC_FAILED_NUM; + last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1; + last_step %= REC_FAILED_NUM; + seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n" + "%s: %d\n%s: %d\n%s: %d\n%s: %d\n", + "success", suspend_stats.success, + "fail", suspend_stats.fail, + "failed_freeze", suspend_stats.failed_freeze, + "failed_prepare", suspend_stats.failed_prepare, + "failed_suspend", suspend_stats.failed_suspend, + "failed_suspend_noirq", + suspend_stats.failed_suspend_noirq, + "failed_resume", suspend_stats.failed_resume, + "failed_resume_noirq", + suspend_stats.failed_resume_noirq); + seq_printf(s, "failures:\n last_failed_dev:\t%-s\n", + suspend_stats.failed_devs[last_dev]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_dev + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_stats.failed_devs[index]); + } + seq_printf(s, " last_failed_errno:\t%-d\n", + suspend_stats.errno[last_errno]); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_errno + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-d\n", + suspend_stats.errno[index]); + } + seq_printf(s, " last_failed_step:\t%-s\n", + suspend_step_name( + 
suspend_stats.failed_steps[last_step])); + for (i = 1; i < REC_FAILED_NUM; i++) { + index = last_step + REC_FAILED_NUM - i; + index %= REC_FAILED_NUM; + seq_printf(s, "\t\t\t%-s\n", + suspend_step_name( + suspend_stats.failed_steps[index])); + } + + return 0; +} + +static int suspend_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, suspend_stats_show, NULL); +} + +static const struct file_operations suspend_stats_operations = { + .open = suspend_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init pm_debugfs_init(void) +{ + debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO, + NULL, NULL, &suspend_stats_operations); + return 0; +} + +late_initcall(pm_debugfs_init); +#endif /* CONFIG_DEBUG_FS */ + #endif /* CONFIG_PM_SLEEP */ struct kobject *power_kobj; @@ -194,6 +291,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, } if (state < PM_SUSPEND_MAX && *s) error = enter_state(state); + if (error) { + suspend_stats.fail++; + dpm_save_failed_errno(error); + } else + suspend_stats.success++; #endif Exit: diff --git a/kernel/power/power.h b/kernel/power/power.h index 9a00a0a26280..23a2db1ec442 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void); */ #define SF_PLATFORM_MODE 1 #define SF_NOCOMPRESS_MODE 2 +#define SF_CRC32_MODE 4 /* kernel/power/hibernate.c */ extern int swsusp_check(void); @@ -228,7 +229,8 @@ extern int pm_test_level; #ifdef CONFIG_SUSPEND_FREEZER static inline int suspend_freeze_processes(void) { - return freeze_processes(); + int error = freeze_processes(); + return error ? 
: freeze_kernel_threads(); } static inline void suspend_thaw_processes(void) diff --git a/kernel/power/process.c b/kernel/power/process.c index 0cf3a27a6c9d..addbbe5531bc 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only) } /** - * freeze_processes - tell processes to enter the refrigerator + * freeze_processes - Signal user space processes to enter the refrigerator. */ int freeze_processes(void) { @@ -143,20 +143,30 @@ int freeze_processes(void) printk("Freezing user space processes ... "); error = try_to_freeze_tasks(true); - if (error) - goto Exit; - printk("done.\n"); + if (!error) { + printk("done."); + oom_killer_disable(); + } + printk("\n"); + BUG_ON(in_atomic()); + + return error; +} + +/** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. + */ +int freeze_kernel_threads(void) +{ + int error; printk("Freezing remaining freezable tasks ... "); error = try_to_freeze_tasks(false); - if (error) - goto Exit; - printk("done."); + if (!error) + printk("done."); - oom_killer_disable(); - Exit: - BUG_ON(in_atomic()); printk("\n"); + BUG_ON(in_atomic()); return error; } diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c index 37f05d0f0793..1c1797dd1d1d 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/power/qos.c @@ -29,7 +29,7 @@ /*#define DEBUG*/ -#include <linux/pm_qos_params.h> +#include <linux/pm_qos.h> #include <linux/sched.h> #include <linux/spinlock.h> #include <linux/slab.h> @@ -45,62 +45,57 @@ #include <linux/uaccess.h> /* - * locking rule: all changes to requests or notifiers lists + * locking rule: all changes to constraints or notifiers lists * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. 
One lock to rule them all */ -enum pm_qos_type { - PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ -}; - -/* - * Note: The lockless read path depends on the CPU accessing - * target_value atomically. Atomic access is only guaranteed on all CPU - * types linux supports for 32 bit quantites - */ struct pm_qos_object { - struct plist_head requests; - struct blocking_notifier_head *notifiers; + struct pm_qos_constraints *constraints; struct miscdevice pm_qos_power_miscdev; char *name; - s32 target_value; /* Do not change to 64 bit */ - s32 default_value; - enum pm_qos_type type; }; static DEFINE_SPINLOCK(pm_qos_lock); static struct pm_qos_object null_pm_qos; + static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); -static struct pm_qos_object cpu_dma_pm_qos = { - .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), - .notifiers = &cpu_dma_lat_notifier, - .name = "cpu_dma_latency", +static struct pm_qos_constraints cpu_dma_constraints = { + .list = PLIST_HEAD_INIT(cpu_dma_constraints.list), .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, .type = PM_QOS_MIN, + .notifiers = &cpu_dma_lat_notifier, +}; +static struct pm_qos_object cpu_dma_pm_qos = { + .constraints = &cpu_dma_constraints, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); -static struct pm_qos_object network_lat_pm_qos = { - .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), - .notifiers = &network_lat_notifier, - .name = "network_latency", +static struct pm_qos_constraints network_lat_constraints = { + .list = PLIST_HEAD_INIT(network_lat_constraints.list), .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, - .type = PM_QOS_MIN + .type = PM_QOS_MIN, + .notifiers = &network_lat_notifier, +}; +static struct pm_qos_object network_lat_pm_qos = { + .constraints = &network_lat_constraints, + .name = "network_latency", }; static 
BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); -static struct pm_qos_object network_throughput_pm_qos = { - .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), - .notifiers = &network_throughput_notifier, - .name = "network_throughput", +static struct pm_qos_constraints network_tput_constraints = { + .list = PLIST_HEAD_INIT(network_tput_constraints.list), .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, .type = PM_QOS_MAX, + .notifiers = &network_throughput_notifier, +}; +static struct pm_qos_object network_throughput_pm_qos = { + .constraints = &network_tput_constraints, + .name = "network_throughput", }; @@ -127,17 +122,17 @@ static const struct file_operations pm_qos_power_fops = { }; /* unlocked internal variant */ -static inline int pm_qos_get_value(struct pm_qos_object *o) +static inline int pm_qos_get_value(struct pm_qos_constraints *c) { - if (plist_head_empty(&o->requests)) - return o->default_value; + if (plist_head_empty(&c->list)) + return c->default_value; - switch (o->type) { + switch (c->type) { case PM_QOS_MIN: - return plist_first(&o->requests)->prio; + return plist_first(&c->list)->prio; case PM_QOS_MAX: - return plist_last(&o->requests)->prio; + return plist_last(&c->list)->prio; default: /* runtime check for not using enum */ @@ -145,69 +140,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o) } } -static inline s32 pm_qos_read_value(struct pm_qos_object *o) +s32 pm_qos_read_value(struct pm_qos_constraints *c) { - return o->target_value; + return c->target_value; } -static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) +static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value) { - o->target_value = value; + c->target_value = value; } -static void update_target(struct pm_qos_object *o, struct plist_node *node, - int del, int value) +/** + * pm_qos_update_target - manages the constraints list and calls the 
notifiers + * if needed + * @c: constraints data struct + * @node: request to add to the list, to update or to remove + * @action: action to take on the constraints list + * @value: value of the request to add or update + * + * This function returns 1 if the aggregated constraint value has changed, 0 + * otherwise. + */ +int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node, + enum pm_qos_req_action action, int value) { unsigned long flags; - int prev_value, curr_value; + int prev_value, curr_value, new_value; spin_lock_irqsave(&pm_qos_lock, flags); - prev_value = pm_qos_get_value(o); - /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ - if (value != PM_QOS_DEFAULT_VALUE) { + prev_value = pm_qos_get_value(c); + if (value == PM_QOS_DEFAULT_VALUE) + new_value = c->default_value; + else + new_value = value; + + switch (action) { + case PM_QOS_REMOVE_REQ: + plist_del(node, &c->list); + break; + case PM_QOS_UPDATE_REQ: /* * to change the list, we atomically remove, reinit * with new value and add, then see if the extremal * changed */ - plist_del(node, &o->requests); - plist_node_init(node, value); - plist_add(node, &o->requests); - } else if (del) { - plist_del(node, &o->requests); - } else { - plist_add(node, &o->requests); + plist_del(node, &c->list); + case PM_QOS_ADD_REQ: + plist_node_init(node, new_value); + plist_add(node, &c->list); + break; + default: + /* no action */ + ; } - curr_value = pm_qos_get_value(o); - pm_qos_set_value(o, curr_value); + + curr_value = pm_qos_get_value(c); + pm_qos_set_value(c, curr_value); + spin_unlock_irqrestore(&pm_qos_lock, flags); - if (prev_value != curr_value) - blocking_notifier_call_chain(o->notifiers, + if (prev_value != curr_value) { + blocking_notifier_call_chain(c->notifiers, (unsigned long)curr_value, NULL); -} - -static int register_pm_qos_misc(struct pm_qos_object *qos) -{ - qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; - qos->pm_qos_power_miscdev.name = qos->name; - 
qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; - - return misc_register(&qos->pm_qos_power_miscdev); -} - -static int find_pm_qos_object_by_minor(int minor) -{ - int pm_qos_class; - - for (pm_qos_class = 0; - pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { - if (minor == - pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) - return pm_qos_class; + return 1; + } else { + return 0; } - return -1; } /** @@ -218,11 +217,11 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return pm_qos_read_value(pm_qos_array[pm_qos_class]); + return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints); } EXPORT_SYMBOL_GPL(pm_qos_request); -int pm_qos_request_active(struct pm_qos_request_list *req) +int pm_qos_request_active(struct pm_qos_request *req) { return req->pm_qos_class != 0; } @@ -230,40 +229,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active); /** * pm_qos_add_request - inserts new qos request into the list - * @dep: pointer to a preallocated handle + * @req: pointer to a preallocated handle * @pm_qos_class: identifies which list of qos request to use * @value: defines the qos request * * This function inserts a new entry in the pm_qos_class list of requested qos * performance characteristics. It recomputes the aggregate QoS expectations - * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * for the pm_qos_class of parameters and initializes the pm_qos_request * handle. Caller needs to save this handle for later use in updates and * removal. 
*/ -void pm_qos_add_request(struct pm_qos_request_list *dep, +void pm_qos_add_request(struct pm_qos_request *req, int pm_qos_class, s32 value) { - struct pm_qos_object *o = pm_qos_array[pm_qos_class]; - int new_value; + if (!req) /*guard against callers passing in null */ + return; - if (pm_qos_request_active(dep)) { + if (pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); return; } - if (value == PM_QOS_DEFAULT_VALUE) - new_value = o->default_value; - else - new_value = value; - plist_node_init(&dep->list, new_value); - dep->pm_qos_class = pm_qos_class; - update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); + req->pm_qos_class = pm_qos_class; + pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints, + &req->node, PM_QOS_ADD_REQ, value); } EXPORT_SYMBOL_GPL(pm_qos_add_request); /** * pm_qos_update_request - modifies an existing qos request - * @pm_qos_req : handle to list element holding a pm_qos request to use + * @req : handle to list element holding a pm_qos request to use * @value: defines the qos request * * Updates an existing qos request for the pm_qos_class of parameters along @@ -271,56 +266,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * * Attempts are made to make this code callable on hot code paths. 
*/ -void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, +void pm_qos_update_request(struct pm_qos_request *req, s32 new_value) { - s32 temp; - struct pm_qos_object *o; - - if (!pm_qos_req) /*guard against callers passing in null */ + if (!req) /*guard against callers passing in null */ return; - if (!pm_qos_request_active(pm_qos_req)) { + if (!pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); return; } - o = pm_qos_array[pm_qos_req->pm_qos_class]; - - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = o->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->list.prio) - update_target(o, &pm_qos_req->list, 0, temp); + if (new_value != req->node.prio) + pm_qos_update_target( + pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_UPDATE_REQ, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); /** * pm_qos_remove_request - modifies an existing qos request - * @pm_qos_req: handle to request list element + * @req: handle to request list element * - * Will remove pm qos request from the list of requests and + * Will remove pm qos request from the list of constraints and * recompute the current target value for the pm_qos_class. Call this * on slow code paths. 
*/ -void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) +void pm_qos_remove_request(struct pm_qos_request *req) { - struct pm_qos_object *o; - - if (pm_qos_req == NULL) + if (!req) /*guard against callers passing in null */ return; /* silent return to keep pcm code cleaner */ - if (!pm_qos_request_active(pm_qos_req)) { + if (!pm_qos_request_active(req)) { WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); return; } - o = pm_qos_array[pm_qos_req->pm_qos_class]; - update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); - memset(pm_qos_req, 0, sizeof(*pm_qos_req)); + pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints, + &req->node, PM_QOS_REMOVE_REQ, + PM_QOS_DEFAULT_VALUE); + memset(req, 0, sizeof(*req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -337,7 +323,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) int retval; retval = blocking_notifier_chain_register( - pm_qos_array[pm_qos_class]->notifiers, notifier); + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); return retval; } @@ -356,19 +343,43 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) int retval; retval = blocking_notifier_chain_unregister( - pm_qos_array[pm_qos_class]->notifiers, notifier); + pm_qos_array[pm_qos_class]->constraints->notifiers, + notifier); return retval; } EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); +/* User space interface to PM QoS classes via misc devices */ +static int register_pm_qos_misc(struct pm_qos_object *qos) +{ + qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; + qos->pm_qos_power_miscdev.name = qos->name; + qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops; + + return misc_register(&qos->pm_qos_power_miscdev); +} + +static int find_pm_qos_object_by_minor(int minor) +{ + int pm_qos_class; + + for (pm_qos_class = 0; + pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { + if (minor == + 
pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) + return pm_qos_class; + } + return -1; +} + static int pm_qos_power_open(struct inode *inode, struct file *filp) { long pm_qos_class; pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); + struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) return -ENOMEM; @@ -383,7 +394,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) static int pm_qos_power_release(struct inode *inode, struct file *filp) { - struct pm_qos_request_list *req; + struct pm_qos_request *req; req = filp->private_data; pm_qos_remove_request(req); @@ -398,17 +409,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, { s32 value; unsigned long flags; - struct pm_qos_object *o; - struct pm_qos_request_list *pm_qos_req = filp->private_data; + struct pm_qos_request *req = filp->private_data; - if (!pm_qos_req) + if (!req) return -EINVAL; - if (!pm_qos_request_active(pm_qos_req)) + if (!pm_qos_request_active(req)) return -EINVAL; - o = pm_qos_array[pm_qos_req->pm_qos_class]; spin_lock_irqsave(&pm_qos_lock, flags); - value = pm_qos_get_value(o); + value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints); spin_unlock_irqrestore(&pm_qos_lock, flags); return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); @@ -418,7 +427,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos) { s32 value; - struct pm_qos_request_list *pm_qos_req; + struct pm_qos_request *req; if (count == sizeof(s32)) { if (copy_from_user(&value, buf, sizeof(s32))) @@ -449,8 +458,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, return -EINVAL; } - pm_qos_req = filp->private_data; - pm_qos_update_request(pm_qos_req, value); + req = filp->private_data; + pm_qos_update_request(req, value); return count; } diff 
--git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 06efa54f93d6..cbe2c1441392 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void) count += highmem; count -= totalreserve_pages; + /* Add number of pages required for page keys (s390 only). */ + size += page_key_additional_pages(saveable); + /* Compute the maximum number of saveable pages to leave in memory. */ max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); @@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm) buf[j] = memory_bm_next_pfn(bm); if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Save page key for data page (s390 only). */ + page_key_read(buf + j); } } @@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; + /* Extract and buffer page key for data page (s390 only). */ + page_key_memorize(buf + j); + if (memory_bm_pfn_present(bm, buf[j])) memory_bm_set_bit(bm, buf[j]); else @@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle) if (error) return error; + /* Allocate buffer for page keys. */ + error = page_key_alloc(nr_copy_pages); + if (error) + return error; + } else if (handle->cur <= nr_meta_pages + 1) { error = unpack_orig_pfns(buffer, &copy_bm); if (error) @@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle) } } else { copy_last_highmem_page(); + /* Restore page key for data page (s390 only). */ + page_key_write(handle->buffer); handle->buffer = get_buffer(&orig_bm, &ca); if (IS_ERR(handle->buffer)) return PTR_ERR(handle->buffer); @@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle) void snapshot_write_finalize(struct snapshot_handle *handle) { copy_last_highmem_page(); + /* Restore page key for data page (s390 only).
*/ + page_key_write(handle->buffer); + page_key_free(); /* Free only if we have loaded the image entirely */ if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b6b71ad2208f..fdd4263b995d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -104,7 +104,10 @@ static int suspend_prepare(void) goto Finish; error = suspend_freeze_processes(); - if (!error) + if (error) { + suspend_stats.failed_freeze++; + dpm_save_failed_step(SUSPEND_FREEZE); + } else return 0; suspend_thaw_processes(); @@ -315,8 +318,16 @@ int enter_state(suspend_state_t state) */ int pm_suspend(suspend_state_t state) { - if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) - return enter_state(state); + int ret; + if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) { + ret = enter_state(state); + if (ret) { + suspend_stats.fail++; + dpm_save_failed_errno(ret); + } else + suspend_stats.success++; + return ret; + } return -EINVAL; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 7c97c3a0eee3..11a594c4ba25 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -27,6 +27,10 @@ #include <linux/slab.h> #include <linux/lzo.h> #include <linux/vmalloc.h> +#include <linux/cpumask.h> +#include <linux/atomic.h> +#include <linux/kthread.h> +#include <linux/crc32.h> #include "power.h" @@ -43,8 +47,7 @@ * allocated and populated one at a time, so we only need one memory * page to set up the entire structure. * - * During resume we also only need to use one swap_map_page structure - * at a time. + * During resume we pick up all swap_map_page structures into a list. 
*/ #define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) @@ -54,6 +57,11 @@ struct swap_map_page { sector_t next_swap; }; +struct swap_map_page_list { + struct swap_map_page *map; + struct swap_map_page_list *next; +}; + /** * The swap_map_handle structure is used for handling swap in * a file-alike way @@ -61,13 +69,18 @@ struct swap_map_page { struct swap_map_handle { struct swap_map_page *cur; + struct swap_map_page_list *maps; sector_t cur_swap; sector_t first_sector; unsigned int k; + unsigned long nr_free_pages, written; + u32 crc32; }; struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; + char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) - + sizeof(u32)]; + u32 crc32; sector_t image; unsigned int flags; /* Flags to pass to the "boot" kernel */ char orig_sig[10]; @@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags) memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); swsusp_header->image = handle->first_sector; swsusp_header->flags = flags; + if (flags & SF_CRC32_MODE) + swsusp_header->crc32 = handle->crc32; error = hib_bio_write_page(swsusp_resume_block, swsusp_header, NULL); } else { @@ -245,6 +260,7 @@ static int swsusp_swap_check(void) static int write_page(void *buf, sector_t offset, struct bio **bio_chain) { void *src; + int ret; if (!offset) return -ENOSPC; @@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) if (src) { copy_page(src, buf); } else { - WARN_ON_ONCE(1); - bio_chain = NULL; /* Go synchronous */ - src = buf; + ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ + if (ret) + return ret; + src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (src) { + copy_page(src, buf); + } else { + WARN_ON_ONCE(1); + bio_chain = NULL; /* Go synchronous */ + src = buf; + } } } else { src = buf; @@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle) goto err_rel; } handle->k = 0; + 
handle->nr_free_pages = nr_free_pages() >> 1; + handle->written = 0; handle->first_sector = handle->cur_swap; return 0; err_rel: @@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, return error; handle->cur->entries[handle->k++] = offset; if (handle->k >= MAP_PAGE_ENTRIES) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; offset = alloc_swapdev_block(root_swap); if (!offset) return -ENOSPC; handle->cur->next_swap = offset; - error = write_page(handle->cur, handle->cur_swap, NULL); + error = write_page(handle->cur, handle->cur_swap, bio_chain); if (error) goto out; clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; } + if (bio_chain && ++handle->written > handle->nr_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + handle->written = 0; + } out: return error; } @@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle, LZO_HEADER, PAGE_SIZE) #define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) +/* Maximum number of threads for compression/decompression. */ +#define LZO_THREADS 3 + +/* Maximum number of pages for read buffering. */ +#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) + + /** * save_image - save the suspend image data */ @@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle, return ret; } +/** + * Structure used for CRC32. + */ +struct crc_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + unsigned run_threads; /* nr current threads */ + wait_queue_head_t go; /* start crc update */ + wait_queue_head_t done; /* crc update done */ + u32 *crc32; /* points to handle's crc32 */ + size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */ + unsigned char *unc[LZO_THREADS]; /* uncompressed data */ +}; + +/** + * CRC32 update function that runs in its own thread. 
+ */ +static int crc32_threadfn(void *data) +{ + struct crc_data *d = data; + unsigned i; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + for (i = 0; i < d->run_threads; i++) + *d->crc32 = crc32_le(*d->crc32, + d->unc[i], *d->unc_len[i]); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} +/** + * Structure used for LZO data compression. + */ +struct cmp_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start compression */ + wait_queue_head_t done; /* compression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ + unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */ +}; + +/** + * Compression function that runs in its own thread. + */ +static int lzo_compress_threadfn(void *data) +{ + struct cmp_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->ret = lzo1x_1_compress(d->unc, d->unc_len, + d->cmp + LZO_HEADER, &d->cmp_len, + d->wrk); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; +} /** * save_image_lzo - Save the suspend image data compressed with LZO. 
@@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle, struct bio *bio; struct timeval start; struct timeval stop; - size_t off, unc_len, cmp_len; - unsigned char *unc, *cmp, *wrk, *page; + size_t off; + unsigned thr, run_threads, nr_threads; + unsigned char *page = NULL; + struct cmp_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for compression to limit memory + * footprint. + */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out_clean; } - wrk = vmalloc(LZO1X_1_MEM_COMPRESS); - if (!wrk) { - printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); - free_page((unsigned long)page); - return -ENOMEM; + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct cmp_data, go)); - unc = vmalloc(LZO_UNC_SIZE); - if (!unc) { - printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - vfree(wrk); - free_page((unsigned long)page); - return -ENOMEM; + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the compression threads. 
+ */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_compress_threadfn, + &data[thr], + "image_compress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start compression threads\n"); + ret = -ENOMEM; + goto out_clean; + } } - cmp = vmalloc(LZO_CMP_SIZE); - if (!cmp) { - printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); - vfree(unc); - vfree(wrk); - free_page((unsigned long)page); - return -ENOMEM; + /* + * Adjust number of free pages after all allocations have been done. + * We don't want to run out of pages when writing. + */ + handle->nr_free_pages = nr_free_pages() >> 1; + + /* + * Start the CRC32 thread. + */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; + } + + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; } printk(KERN_INFO + "PM: Using %u thread(s) for compression.\n" "PM: Compressing and saving image data (%u pages) ... 
", - nr_to_write); + nr_threads, nr_to_write); m = nr_to_write / 100; if (!m) m = 1; @@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for (;;) { - for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { - ret = snapshot_read_next(snapshot); - if (ret < 0) - goto out_finish; - - if (!ret) + for (thr = 0; thr < nr_threads; thr++) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(data[thr].unc + off, + data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", + nr_pages / m); + nr_pages++; + } + if (!off) break; - memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); + data[thr].unc_len = off; - if (!(nr_pages % m)) - printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); } - if (!off) + if (!thr) break; - unc_len = off; - ret = lzo1x_1_compress(unc, unc_len, - cmp + LZO_HEADER, &cmp_len, wrk); - if (ret < 0) { - printk(KERN_ERR "PM: LZO compression failed\n"); - break; - } + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); - if (unlikely(!cmp_len || - cmp_len > lzo1x_worst_compress(unc_len))) { - printk(KERN_ERR "PM: Invalid LZO compressed length\n"); - ret = -1; - break; - } + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); - *(size_t *)cmp = cmp_len; + ret = data[thr].ret; - /* - * Given we are writing one page at a time to disk, we copy - * that much from the buffer, although the last bit will likely - * be smaller than full page. This is OK - we saved the length - * of the compressed data, so any garbage at the end will be - * discarded when we read it. 
- */ - for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { - memcpy(page, cmp + off, PAGE_SIZE); + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + goto out_finish; + } - ret = swap_write_page(handle, page, &bio); - if (ret) + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(data[thr].unc_len))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; goto out_finish; + } + + *(size_t *)data[thr].cmp = data[thr].cmp_len; + + /* + * Given we are writing one page at a time to disk, we + * copy that much from the buffer, although the last + * bit will likely be smaller than full page. This is + * OK - we saved the length of the compressed data, so + * any garbage at the end will be discarded when we + * read it. + */ + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(page, data[thr].cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } } + + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); } out_finish: @@ -536,16 +737,25 @@ out_finish: do_gettimeofday(&stop); if (!ret) ret = err2; - if (!ret) + if (!ret) { printk(KERN_CONT "\b\b\b\bdone\n"); - else + } else { printk(KERN_CONT "\n"); + } swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); - - vfree(cmp); - vfree(unc); - vfree(wrk); - free_page((unsigned long)page); +out_clean: + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) free_page((unsigned long)page); return ret; } @@ -625,8 +835,15 @@ out_finish: static void release_swap_reader(struct swap_map_handle *handle) { - if (handle->cur) - free_page((unsigned long)handle->cur); + struct swap_map_page_list *tmp; + + while (handle->maps) { + if (handle->maps->map) + free_page((unsigned long)handle->maps->map); + tmp = 
handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + } handle->cur = NULL; } @@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle, unsigned int *flags_p) { int error; + struct swap_map_page_list *tmp, *last; + sector_t offset; *flags_p = swsusp_header->flags; if (!swsusp_header->image) /* how can this happen? */ return -EINVAL; - handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); - if (!handle->cur) - return -ENOMEM; + handle->cur = NULL; + last = handle->maps = NULL; + offset = swsusp_header->image; + while (offset) { + tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); + if (!tmp) { + release_swap_reader(handle); + return -ENOMEM; + } + memset(tmp, 0, sizeof(*tmp)); + if (!handle->maps) + handle->maps = tmp; + if (last) + last->next = tmp; + last = tmp; + + tmp->map = (struct swap_map_page *) + __get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!tmp->map) { + release_swap_reader(handle); + return -ENOMEM; + } - error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); - if (error) { - release_swap_reader(handle); - return error; + error = hib_bio_read_page(offset, tmp->map, NULL); + if (error) { + release_swap_reader(handle); + return error; + } + offset = tmp->map->next_swap; } handle->k = 0; + handle->cur = handle->maps->map; return 0; } @@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, { sector_t offset; int error; + struct swap_map_page_list *tmp; if (!handle->cur) return -EINVAL; @@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf, if (error) return error; if (++handle->k >= MAP_PAGE_ENTRIES) { - error = hib_wait_on_bio_chain(bio_chain); handle->k = 0; - offset = handle->cur->next_swap; - if (!offset) + free_page((unsigned long)handle->maps->map); + tmp = handle->maps; + handle->maps = handle->maps->next; + kfree(tmp); + if (!handle->maps) release_swap_reader(handle); - else if (!error) - error = 
hib_bio_read_page(offset, handle->cur, NULL); + else + handle->cur = handle->maps->map; } return error; } @@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; struct timeval start; struct timeval stop; struct bio *bio; @@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); for ( ; ; ) { - error = snapshot_write_next(snapshot); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) break; - error = swap_read_page(handle, data_of(*snapshot), &bio); - if (error) + ret = swap_read_page(handle, data_of(*snapshot), &bio); + if (ret) break; if (snapshot->sync_read) - error = hib_wait_on_bio_chain(&bio); - if (error) + ret = hib_wait_on_bio_chain(&bio); + if (ret) break; if (!(nr_pages % m)) printk("\b\b\b\b%3d%%", nr_pages / m); @@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle, } err2 = hib_wait_on_bio_chain(&bio); do_gettimeofday(&stop); - if (!error) - error = err2; - if (!error) { + if (!ret) + ret = err2; + if (!ret) { printk("\b\b\b\bdone\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; } else printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - return error; + return ret; +} + +/** + * Structure used for LZO data decompression. + */ +struct dec_data { + struct task_struct *thr; /* thread */ + atomic_t ready; /* ready to start flag */ + atomic_t stop; /* ready to stop flag */ + int ret; /* return code */ + wait_queue_head_t go; /* start decompression */ + wait_queue_head_t done; /* decompression done */ + size_t unc_len; /* uncompressed length */ + size_t cmp_len; /* compressed length */ + unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */ + unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */ +}; + +/** + * Deompression function that runs in its own thread. 
+ */ +static int lzo_decompress_threadfn(void *data) +{ + struct dec_data *d = data; + + while (1) { + wait_event(d->go, atomic_read(&d->ready) || + kthread_should_stop()); + if (kthread_should_stop()) { + d->thr = NULL; + d->ret = -1; + atomic_set(&d->stop, 1); + wake_up(&d->done); + break; + } + atomic_set(&d->ready, 0); + + d->unc_len = LZO_UNC_SIZE; + d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len, + d->unc, &d->unc_len); + atomic_set(&d->stop, 1); + wake_up(&d->done); + } + return 0; } /** @@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned int nr_to_read) { unsigned int m; - int error = 0; + int ret = 0; + int eof = 0; struct bio *bio; struct timeval start; struct timeval stop; unsigned nr_pages; - size_t i, off, unc_len, cmp_len; - unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; - - for (i = 0; i < LZO_CMP_PAGES; i++) { - page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); - if (!page[i]) { - printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + size_t off; + unsigned i, thr, run_threads, nr_threads; + unsigned ring = 0, pg = 0, ring_size = 0, + have = 0, want, need, asked = 0; + unsigned long read_pages; + unsigned char **page = NULL; + struct dec_data *data = NULL; + struct crc_data *crc = NULL; + + /* + * We'll limit the number of threads for decompression to limit memory + * footprint. 
+ */ + nr_threads = num_online_cpus() - 1; + nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); + + page = vmalloc(sizeof(*page) * LZO_READ_PAGES); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + ret = -ENOMEM; + goto out_clean; + } - while (i) - free_page((unsigned long)page[--i]); + data = vmalloc(sizeof(*data) * nr_threads); + if (!data) { + printk(KERN_ERR "PM: Failed to allocate LZO data\n"); + ret = -ENOMEM; + goto out_clean; + } + for (thr = 0; thr < nr_threads; thr++) + memset(&data[thr], 0, offsetof(struct dec_data, go)); - return -ENOMEM; + crc = kmalloc(sizeof(*crc), GFP_KERNEL); + if (!crc) { + printk(KERN_ERR "PM: Failed to allocate crc\n"); + ret = -ENOMEM; + goto out_clean; + } + memset(crc, 0, offsetof(struct crc_data, go)); + + /* + * Start the decompression threads. + */ + for (thr = 0; thr < nr_threads; thr++) { + init_waitqueue_head(&data[thr].go); + init_waitqueue_head(&data[thr].done); + + data[thr].thr = kthread_run(lzo_decompress_threadfn, + &data[thr], + "image_decompress/%u", thr); + if (IS_ERR(data[thr].thr)) { + data[thr].thr = NULL; + printk(KERN_ERR + "PM: Cannot start decompression threads\n"); + ret = -ENOMEM; + goto out_clean; } } - unc = vmalloc(LZO_UNC_SIZE); - if (!unc) { - printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); - - for (i = 0; i < LZO_CMP_PAGES; i++) - free_page((unsigned long)page[i]); - - return -ENOMEM; + /* + * Start the CRC32 thread. 
+ */ + init_waitqueue_head(&crc->go); + init_waitqueue_head(&crc->done); + + handle->crc32 = 0; + crc->crc32 = &handle->crc32; + for (thr = 0; thr < nr_threads; thr++) { + crc->unc[thr] = data[thr].unc; + crc->unc_len[thr] = &data[thr].unc_len; } - cmp = vmalloc(LZO_CMP_SIZE); - if (!cmp) { - printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32"); + if (IS_ERR(crc->thr)) { + crc->thr = NULL; + printk(KERN_ERR "PM: Cannot start CRC32 thread\n"); + ret = -ENOMEM; + goto out_clean; + } - vfree(unc); - for (i = 0; i < LZO_CMP_PAGES; i++) - free_page((unsigned long)page[i]); + /* + * Adjust number of pages for read buffering, in case we are short. + */ + read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; + read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); - return -ENOMEM; + for (i = 0; i < read_pages; i++) { + page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? + __GFP_WAIT | __GFP_HIGH : + __GFP_WAIT); + if (!page[i]) { + if (i < LZO_CMP_PAGES) { + ring_size = i; + printk(KERN_ERR + "PM: Failed to allocate LZO pages\n"); + ret = -ENOMEM; + goto out_clean; + } else { + break; + } + } } + want = ring_size = i; printk(KERN_INFO + "PM: Using %u thread(s) for decompression.\n" "PM: Loading and decompressing image data (%u pages) ... 
", - nr_to_read); + nr_threads, nr_to_read); m = nr_to_read / 100; if (!m) m = 1; @@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle, bio = NULL; do_gettimeofday(&start); - error = snapshot_write_next(snapshot); - if (error <= 0) + ret = snapshot_write_next(snapshot); + if (ret <= 0) goto out_finish; - for (;;) { - error = swap_read_page(handle, page[0], NULL); /* sync */ - if (error) - break; - - cmp_len = *(size_t *)page[0]; - if (unlikely(!cmp_len || - cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { - printk(KERN_ERR "PM: Invalid LZO compressed length\n"); - error = -1; - break; + for(;;) { + for (i = 0; !eof && i < want; i++) { + ret = swap_read_page(handle, page[ring], &bio); + if (ret) { + /* + * On real read error, finish. On end of data, + * set EOF flag and just exit the read loop. + */ + if (handle->cur && + handle->cur->entries[handle->k]) { + goto out_finish; + } else { + eof = 1; + break; + } + } + if (++ring >= ring_size) + ring = 0; } + asked += i; + want -= i; - for (off = PAGE_SIZE, i = 1; - off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { - error = swap_read_page(handle, page[i], &bio); - if (error) + /* + * We are out of data, wait for some more. 
+ */ + if (!have) { + if (!asked) + break; + + ret = hib_wait_on_bio_chain(&bio); + if (ret) goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; } - error = hib_wait_on_bio_chain(&bio); /* need all data now */ - if (error) - goto out_finish; - - for (off = 0, i = 0; - off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { - memcpy(cmp + off, page[i], PAGE_SIZE); + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + crc->run_threads = 0; } - unc_len = LZO_UNC_SIZE; - error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, - unc, &unc_len); - if (error < 0) { - printk(KERN_ERR "PM: LZO decompression failed\n"); - break; + for (thr = 0; have && thr < nr_threads; thr++) { + data[thr].cmp_len = *(size_t *)page[pg]; + if (unlikely(!data[thr].cmp_len || + data[thr].cmp_len > + lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR + "PM: Invalid LZO compressed length\n"); + ret = -1; + goto out_finish; + } + + need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER, + PAGE_SIZE); + if (need > have) { + if (eof > 1) { + ret = -1; + goto out_finish; + } + break; + } + + for (off = 0; + off < LZO_HEADER + data[thr].cmp_len; + off += PAGE_SIZE) { + memcpy(data[thr].cmp + off, + page[pg], PAGE_SIZE); + have--; + want++; + if (++pg >= ring_size) + pg = 0; + } + + atomic_set(&data[thr].ready, 1); + wake_up(&data[thr].go); } - if (unlikely(!unc_len || - unc_len > LZO_UNC_SIZE || - unc_len & (PAGE_SIZE - 1))) { - printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); - error = -1; - break; + /* + * Wait for more data while we are decompressing. 
+ */ + if (have < LZO_CMP_PAGES && asked) { + ret = hib_wait_on_bio_chain(&bio); + if (ret) + goto out_finish; + have += asked; + asked = 0; + if (eof) + eof = 2; } - for (off = 0; off < unc_len; off += PAGE_SIZE) { - memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); + for (run_threads = thr, thr = 0; thr < run_threads; thr++) { + wait_event(data[thr].done, + atomic_read(&data[thr].stop)); + atomic_set(&data[thr].stop, 0); + + ret = data[thr].ret; - if (!(nr_pages % m)) - printk("\b\b\b\b%3d%%", nr_pages / m); - nr_pages++; + if (ret < 0) { + printk(KERN_ERR + "PM: LZO decompression failed\n"); + goto out_finish; + } - error = snapshot_write_next(snapshot); - if (error <= 0) + if (unlikely(!data[thr].unc_len || + data[thr].unc_len > LZO_UNC_SIZE || + data[thr].unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR + "PM: Invalid LZO uncompressed length\n"); + ret = -1; goto out_finish; + } + + for (off = 0; + off < data[thr].unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), + data[thr].unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + + ret = snapshot_write_next(snapshot); + if (ret <= 0) { + crc->run_threads = thr + 1; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); + goto out_finish; + } + } } + + crc->run_threads = thr; + atomic_set(&crc->ready, 1); + wake_up(&crc->go); } out_finish: + if (crc->run_threads) { + wait_event(crc->done, atomic_read(&crc->stop)); + atomic_set(&crc->stop, 0); + } do_gettimeofday(&stop); - if (!error) { + if (!ret) { printk("\b\b\b\bdone\n"); snapshot_write_finalize(snapshot); if (!snapshot_image_loaded(snapshot)) - error = -ENODATA; + ret = -ENODATA; + if (!ret) { + if (swsusp_header->flags & SF_CRC32_MODE) { + if(handle->crc32 != swsusp_header->crc32) { + printk(KERN_ERR + "PM: Invalid image CRC32!\n"); + ret = -ENODATA; + } + } + } } else printk("\n"); swsusp_show_speed(&start, &stop, nr_to_read, "Read"); - - vfree(cmp); - vfree(unc); - for (i = 0; i < LZO_CMP_PAGES; i++) 
+out_clean: + for (i = 0; i < ring_size; i++) free_page((unsigned long)page[i]); + if (crc) { + if (crc->thr) + kthread_stop(crc->thr); + kfree(crc); + } + if (data) { + for (thr = 0; thr < nr_threads; thr++) + if (data[thr].thr) + kthread_stop(data[thr].thr); + vfree(data); + } + if (page) vfree(page); - return error; + return ret; } /** diff --git a/kernel/printk.c b/kernel/printk.c index 28a40d8171b8..b7da18391c38 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -100,7 +100,7 @@ static int console_locked, console_suspended; * It is also used in interesting ways to provide interlocking in * console_unlock();. */ -static DEFINE_SPINLOCK(logbuf_lock); +static DEFINE_RAW_SPINLOCK(logbuf_lock); #define LOG_BUF_MASK (log_buf_len-1) #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) @@ -212,7 +212,7 @@ void __init setup_log_buf(int early) return; } - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; @@ -230,7 +230,7 @@ void __init setup_log_buf(int early) log_start -= offset; con_start -= offset; log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); pr_info("log_buf_len: %d\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", @@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (error) goto out; i = 0; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); while (!error && (log_start != log_end) && i < len) { c = LOG_BUF(log_start); log_start++; - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,buf); buf++; i++; cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (!error) error = i; break; @@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) count = len; if 
(count > log_buf_len) count = log_buf_len; - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); if (count > logged_chars) count = logged_chars; if (do_clear) @@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (j + log_buf_len < log_end) break; c = LOG_BUF(j); - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); error = __put_user(c,&buf[count-1-i]); cond_resched(); - spin_lock_irq(&logbuf_lock); + raw_spin_lock_irq(&logbuf_lock); } - spin_unlock_irq(&logbuf_lock); + raw_spin_unlock_irq(&logbuf_lock); if (error) break; error = i; @@ -689,7 +689,7 @@ static void zap_locks(void) oops_timestamp = jiffies; /* If a crash is occurring, make sure we can't deadlock */ - spin_lock_init(&logbuf_lock); + raw_spin_lock_init(&logbuf_lock); /* And make sure that we print immediately */ sema_init(&console_sem, 1); } @@ -802,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu) } } printk_cpu = UINT_MAX; - spin_unlock(&logbuf_lock); if (wake) up(&console_sem); + raw_spin_unlock(&logbuf_lock); return retval; } static const char recursion_bug_msg [] = @@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } lockdep_off(); - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); printk_cpu = this_cpu; if (recursion_bug) { @@ -1257,14 +1257,14 @@ void console_unlock(void) again: for ( ; ; ) { - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; if (con_start == log_end) break; /* Nothing to print */ _con_start = con_start; _log_end = log_end; con_start = log_end; /* Flush */ - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); start_critical_timings(); @@ -1276,7 +1276,7 @@ again: if (unlikely(exclusive_console)) exclusive_console = NULL; - spin_unlock(&logbuf_lock); + raw_spin_unlock(&logbuf_lock); 
up(&console_sem); @@ -1286,13 +1286,13 @@ again: * there's a new owner and the console_unlock() from them will do the * flush, no worries. */ - spin_lock(&logbuf_lock); + raw_spin_lock(&logbuf_lock); if (con_start != log_end) retry = 1; - spin_unlock_irqrestore(&logbuf_lock, flags); if (retry && console_trylock()) goto again; + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (wake_klogd) wake_up_klogd(); } @@ -1522,9 +1522,9 @@ void register_console(struct console *newcon) * console_unlock(); will print out the buffered messages * for us. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); con_start = log_start; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); /* * We're about to replay the log buffer. Only do this to the * just-registered console to avoid excessive message spam to @@ -1731,10 +1731,10 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* Theoretically, the log could move on after we do this, but there's not a lot we can do about that. The new messages will overwrite the start of what we dump. */ - spin_lock_irqsave(&logbuf_lock, flags); + raw_spin_lock_irqsave(&logbuf_lock, flags); end = log_end & LOG_BUF_MASK; chars = logged_chars; - spin_unlock_irqrestore(&logbuf_lock, flags); + raw_spin_unlock_irqrestore(&logbuf_lock, flags); if (chars > end) { s1 = log_buf + log_buf_len - chars + end; diff --git a/kernel/rcu.h b/kernel/rcu.h new file mode 100644 index 000000000000..f600868d550d --- /dev/null +++ b/kernel/rcu.h @@ -0,0 +1,85 @@ +/* + * Read-Copy Update definitions shared among RCU implementations. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2011 + * + * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +#ifndef __LINUX_RCU_H +#define __LINUX_RCU_H + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally + * by call_rcu() and rcu callback execution, and are therefore not part of the + * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 
+ */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +# define STATE_RCU_HEAD_READY 0 +# define STATE_RCU_HEAD_QUEUED 1 + +extern struct debug_obj_descr rcuhead_debug_descr; + +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ + WARN_ON_ONCE((unsigned long)head & 0x3); + debug_object_activate(head, &rcuhead_debug_descr); + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_READY, + STATE_RCU_HEAD_QUEUED); +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ + debug_object_active_state(head, &rcuhead_debug_descr, + STATE_RCU_HEAD_QUEUED, + STATE_RCU_HEAD_READY); + debug_object_deactivate(head, &rcuhead_debug_descr); +} +#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +static inline void debug_rcu_head_queue(struct rcu_head *head) +{ +} + +static inline void debug_rcu_head_unqueue(struct rcu_head *head) +{ +} +#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +extern void kfree(const void *); + +static inline void __rcu_reclaim(char *rn, struct rcu_head *head) +{ + unsigned long offset = (unsigned long)head->func; + + if (__is_kfree_rcu_offset(offset)) { + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + kfree((void *)head - offset); + } else { + RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + head->func(head); + } +} + +#endif /* __LINUX_RCU_H */ diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index ddddb320be61..ca0d23b6b3e8 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -46,6 +46,11 @@ #include <linux/module.h> #include <linux/hardirq.h> +#define CREATE_TRACE_POINTS +#include <trace/events/rcu.h> + +#include "rcu.h" + #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = @@ -94,11 +99,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; + /* * Awaken the corresponding synchronize_rcu() instance now that a * grace 
period has elapsed. */ -void wakeme_after_rcu(struct rcu_head *head) +static void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; @@ -106,6 +116,20 @@ void wakeme_after_rcu(struct rcu_head *head) complete(&rcu->completion); } +void wait_rcu_gp(call_rcu_func_t crf) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + crf(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(wait_rcu_gp); + #ifdef CONFIG_PROVE_RCU /* * wrapper function to avoid #include problems. diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 7bbac7d0f5ab..da775c87f27f 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -37,16 +37,17 @@ #include <linux/cpu.h> #include <linux/prefetch.h> -/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ -static struct task_struct *rcu_kthread_task; -static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); -static unsigned long have_rcu_kthread_work; +#ifdef CONFIG_RCU_TRACE +#include <trace/events/rcu.h> +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +#include "rcu.h" /* Forward declarations for rcutiny_plugin.h. */ struct rcu_ctrlblk; -static void invoke_rcu_kthread(void); -static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); -static int rcu_kthread(void *arg); +static void invoke_rcu_callbacks(void); +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static void rcu_process_callbacks(struct softirq_action *unused); static void __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_ctrlblk *rcp); @@ -96,16 +97,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) } /* - * Wake up rcu_kthread() to process callbacks now eligible for invocation - * or to boost readers. 
- */ -static void invoke_rcu_kthread(void) -{ - have_rcu_kthread_work = 1; - wake_up(&rcu_kthread_wq); -} - -/* * Record an rcu quiescent state. And an rcu_bh quiescent state while we * are at it, given that any rcu quiescent state is also an rcu_bh * quiescent state. Use "+" instead of "||" to defeat short circuiting. @@ -117,7 +108,7 @@ void rcu_sched_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_sched_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); local_irq_restore(flags); } @@ -130,7 +121,7 @@ void rcu_bh_qs(int cpu) local_irq_save(flags); if (rcu_qsctr_help(&rcu_bh_ctrlblk)) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); local_irq_restore(flags); } @@ -154,18 +145,23 @@ void rcu_check_callbacks(int cpu, int user) * Invoke the RCU callbacks on the specified rcu_ctrlkblk structure * whose grace period has elapsed. */ -static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) +static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { + char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; RCU_TRACE(int cb_count = 0); /* If no RCU callbacks ready to invoke, just return. */ - if (&rcp->rcucblist == rcp->donetail) + if (&rcp->rcucblist == rcp->donetail) { + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); return; + } /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -176,49 +172,26 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. 
*/ + RCU_TRACE(rn = rcp->name); while (list) { next = list->next; prefetch(next); debug_rcu_head_unqueue(list); local_bh_disable(); - __rcu_reclaim(list); + __rcu_reclaim(rn, list); local_bh_enable(); list = next; RCU_TRACE(cb_count++); } RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); } -/* - * This kthread invokes RCU callbacks whose grace periods have - * elapsed. It is awakened as needed, and takes the place of the - * RCU_SOFTIRQ that was used previously for this purpose. - * This is a kthread, but it is never stopped, at least not until - * the system goes down. - */ -static int rcu_kthread(void *arg) +static void rcu_process_callbacks(struct softirq_action *unused) { - unsigned long work; - unsigned long morework; - unsigned long flags; - - for (;;) { - wait_event_interruptible(rcu_kthread_wq, - have_rcu_kthread_work != 0); - morework = rcu_boost(); - local_irq_save(flags); - work = have_rcu_kthread_work; - have_rcu_kthread_work = morework; - local_irq_restore(flags); - if (work) { - rcu_process_callbacks(&rcu_sched_ctrlblk); - rcu_process_callbacks(&rcu_bh_ctrlblk); - rcu_preempt_process_callbacks(); - } - schedule_timeout_interruptible(1); /* Leave CPU for others. */ - } - - return 0; /* Not reached, but needed to shut gcc up. */ + __rcu_process_callbacks(&rcu_sched_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); } /* @@ -280,45 +253,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) __call_rcu(head, func, &rcu_bh_ctrlblk); } EXPORT_SYMBOL_GPL(call_rcu_bh); - -void rcu_barrier_bh(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. 
*/ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier_bh); - -void rcu_barrier_sched(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier_sched); - -/* - * Spawn the kthread that invokes RCU callbacks. - */ -static int __init rcu_spawn_kthreads(void) -{ - struct sched_param sp; - - rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); - sp.sched_priority = RCU_BOOST_PRIO; - sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); - return 0; -} -early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index f259c676195f..02aa7139861c 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -26,29 +26,26 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> -#ifdef CONFIG_RCU_TRACE -#define RCU_TRACE(stmt) stmt -#else /* #ifdef CONFIG_RCU_TRACE */ -#define RCU_TRACE(stmt) -#endif /* #else #ifdef CONFIG_RCU_TRACE */ - /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ struct rcu_head **curtail; /* ->next pointer of last CB. */ RCU_TRACE(long qlen); /* Number of pending CBs. */ + RCU_TRACE(char *name); /* Name of RCU type. */ }; /* Definition for rcupdate control block. 
*/ static struct rcu_ctrlblk rcu_sched_ctrlblk = { .donetail = &rcu_sched_ctrlblk.rcucblist, .curtail = &rcu_sched_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_sched") }; static struct rcu_ctrlblk rcu_bh_ctrlblk = { .donetail = &rcu_bh_ctrlblk.rcucblist, .curtail = &rcu_bh_ctrlblk.rcucblist, + RCU_TRACE(.name = "rcu_bh") }; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -131,6 +128,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), + RCU_TRACE(.rcb.name = "rcu_preempt") }; static int rcu_preempted_readers_exp(void); @@ -247,6 +245,13 @@ static void show_tiny_preempt_stats(struct seq_file *m) #include "rtmutex_common.h" +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO + +/* Controls for rcu_kthread() kthread. */ +static struct task_struct *rcu_kthread_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static unsigned long have_rcu_kthread_work; + /* * Carry out RCU priority boosting on the task indicated by ->boost_tasks, * and advance ->boost_tasks to the next task in the ->blkd_tasks list. @@ -334,7 +339,7 @@ static int rcu_initiate_boost(void) if (rcu_preempt_ctrlblk.exp_tasks == NULL) rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks; - invoke_rcu_kthread(); + invoke_rcu_callbacks(); } else RCU_TRACE(rcu_initiate_boost_trace()); return 1; @@ -353,14 +358,6 @@ static void rcu_preempt_boost_start_gp(void) #else /* #ifdef CONFIG_RCU_BOOST */ /* - * If there is no RCU priority boosting, we don't boost. - */ -static int rcu_boost(void) -{ - return 0; -} - -/* * If there is no RCU priority boosting, we don't initiate boosting, * but we do indicate whether there are blocked readers blocking the * current grace period. @@ -427,7 +424,7 @@ static void rcu_preempt_cpu_qs(void) /* If there are done callbacks, cause them to be invoked. 
*/ if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); } /* @@ -648,7 +645,7 @@ static void rcu_preempt_check_callbacks(void) rcu_preempt_cpu_qs(); if (&rcu_preempt_ctrlblk.rcb.rcucblist != rcu_preempt_ctrlblk.rcb.donetail) - invoke_rcu_kthread(); + invoke_rcu_callbacks(); if (rcu_preempt_gp_in_progress() && rcu_cpu_blocking_cur_gp() && rcu_preempt_running_reader()) @@ -674,7 +671,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) */ static void rcu_preempt_process_callbacks(void) { - rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); + __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); } /* @@ -697,20 +694,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) } EXPORT_SYMBOL_GPL(call_rcu); -void rcu_barrier(void) -{ - struct rcu_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} -EXPORT_SYMBOL_GPL(rcu_barrier); - /* * synchronize_rcu - wait until a grace period has elapsed. * @@ -864,15 +847,6 @@ static void show_tiny_preempt_stats(struct seq_file *m) #endif /* #ifdef CONFIG_RCU_TRACE */ /* - * Because preemptible RCU does not exist, it is never necessary to - * boost preempted RCU readers. - */ -static int rcu_boost(void) -{ - return 0; -} - -/* * Because preemptible RCU does not exist, it never has any callbacks * to check. */ @@ -898,6 +872,78 @@ static void rcu_preempt_process_callbacks(void) #endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST + +/* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. 
+ */ +static void invoke_rcu_callbacks(void) +{ + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); +} + +/* + * This kthread invokes RCU callbacks whose grace periods have + * elapsed. It is awakened as needed, and takes the place of the + * RCU_SOFTIRQ that is used for this purpose when boosting is disabled. + * This is a kthread, but it is never stopped, at least not until + * the system goes down. + */ +static int rcu_kthread(void *arg) +{ + unsigned long work; + unsigned long morework; + unsigned long flags; + + for (;;) { + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); + morework = rcu_boost(); + local_irq_save(flags); + work = have_rcu_kthread_work; + have_rcu_kthread_work = morework; + local_irq_restore(flags); + if (work) + rcu_process_callbacks(NULL); + schedule_timeout_interruptible(1); /* Leave CPU for others. */ + } + + return 0; /* Not reached, but needed to shut gcc up. */ +} + +/* + * Spawn the kthread that invokes RCU callbacks. + */ +static int __init rcu_spawn_kthreads(void) +{ + struct sched_param sp; + + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); + return 0; +} +early_initcall(rcu_spawn_kthreads); + +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Start up softirq processing of callbacks. 
+ */ +void invoke_rcu_callbacks(void) +{ + raise_softirq(RCU_SOFTIRQ); +} + +void rcu_init(void) +{ + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC #include <linux/kernel_stat.h> @@ -913,12 +959,6 @@ void __init rcu_scheduler_starting(void) #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -#ifdef CONFIG_RCU_BOOST -#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO -#else /* #ifdef CONFIG_RCU_BOOST */ -#define RCU_BOOST_PRIO 1 -#endif /* #else #ifdef CONFIG_RCU_BOOST */ - #ifdef CONFIG_RCU_TRACE #ifdef CONFIG_RCU_BOOST diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 98f51b13bb7e..764825c2685c 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -73,7 +73,7 @@ module_param(nreaders, int, 0444); MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); module_param(nfakewriters, int, 0444); MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0444); +module_param(stat_interval, int, 0644); MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); module_param(verbose, bool, 0444); MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); @@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); } -struct rcu_bh_torture_synchronize { - struct rcu_head head; - struct completion completion; -}; - -static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) -{ - struct rcu_bh_torture_synchronize *rcu; - - rcu = container_of(head, struct rcu_bh_torture_synchronize, head); - complete(&rcu->completion); -} - -static void rcu_bh_torture_synchronize(void) -{ - struct rcu_bh_torture_synchronize rcu; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); -} - static struct 
rcu_torture_ops rcu_bh_ops = { .init = NULL, .cleanup = NULL, @@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, + .sync = synchronize_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_sync_torture_deferred_free, - .sync = rcu_bh_torture_synchronize, + .sync = synchronize_rcu_bh, .cb_barrier = NULL, .fqs = rcu_bh_force_quiescent_state, .stats = NULL, @@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .name = "rcu_bh_sync" }; +static struct rcu_torture_ops rcu_bh_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu_bh_expedited, + .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh_expedited" +}; + /* * Definitions for srcu torture testing. 
*/ @@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p) call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); } -static void sched_torture_synchronize(void) -{ - synchronize_sched(); -} - static struct rcu_torture_ops sched_ops = { .init = rcu_sync_torture_init, .cleanup = NULL, @@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = { .readunlock = sched_torture_read_unlock, .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, - .sync = sched_torture_synchronize, + .sync = synchronize_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = { .readunlock = sched_torture_read_unlock, .completed = rcu_no_completed, .deferred_free = rcu_sync_torture_deferred_free, - .sync = sched_torture_synchronize, + .sync = synchronize_sched, .cb_barrier = NULL, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -754,7 +741,7 @@ static int rcu_torture_boost(void *arg) do { /* Wait for the next test interval. */ oldstarttime = boost_starttime; - while (jiffies - oldstarttime > ULONG_MAX / 2) { + while (ULONG_CMP_LT(jiffies, oldstarttime)) { schedule_timeout_uninterruptible(1); rcu_stutter_wait("rcu_torture_boost"); if (kthread_should_stop() || @@ -765,7 +752,7 @@ static int rcu_torture_boost(void *arg) /* Do one boost-test interval. */ endtime = oldstarttime + test_boost_duration * HZ; call_rcu_time = jiffies; - while (jiffies - endtime > ULONG_MAX / 2) { + while (ULONG_CMP_LT(jiffies, endtime)) { /* If we don't have a callback in flight, post one. */ if (!rbi.inflight) { smp_mb(); /* RCU core before ->inflight = 1. */ @@ -792,7 +779,8 @@ static int rcu_torture_boost(void *arg) * interval. Besides, we are running at RT priority, * so delays should be relatively rare. 
*/ - while (oldstarttime == boost_starttime) { + while (oldstarttime == boost_starttime && + !kthread_should_stop()) { if (mutex_trylock(&boost_mutex)) { boost_starttime = jiffies + test_boost_interval * HZ; @@ -809,11 +797,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost"); /* Clean up and exit. */ VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); - destroy_rcu_head_on_stack(&rbi.rcu); rcutorture_shutdown_absorb("rcu_torture_boost"); while (!kthread_should_stop() || rbi.inflight) schedule_timeout_uninterruptible(1); smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); return 0; } @@ -831,11 +819,13 @@ rcu_torture_fqs(void *arg) VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); do { fqs_resume_time = jiffies + fqs_stutter * HZ; - while (jiffies - fqs_resume_time > LONG_MAX) { + while (ULONG_CMP_LT(jiffies, fqs_resume_time) && + !kthread_should_stop()) { schedule_timeout_interruptible(1); } fqs_burst_remaining = fqs_duration; - while (fqs_burst_remaining > 0) { + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { cur_ops->fqs(); udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; @@ -1280,8 +1270,9 @@ static int rcutorture_booster_init(int cpu) /* Don't allow time recalculation while creating a new task. 
*/ mutex_lock(&boost_mutex); VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); - boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, - "rcu_torture_boost"); + boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, + cpu_to_node(cpu), + "rcu_torture_boost"); if (IS_ERR(boost_tasks[cpu])) { retval = PTR_ERR(boost_tasks[cpu]); VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); @@ -1424,7 +1415,7 @@ rcu_torture_init(void) int firsterr = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, - &rcu_bh_ops, &rcu_bh_sync_ops, + &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, &srcu_ops, &srcu_expedited_ops, &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index ba06207b1dd3..e234eb92a177 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -52,13 +52,16 @@ #include <linux/prefetch.h> #include "rcutree.h" +#include <trace/events/rcu.h> + +#include "rcu.h" /* Data structures. */ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; #define RCU_STATE_INITIALIZER(structname) { \ - .level = { &structname.node[0] }, \ + .level = { &structname##_state.node[0] }, \ .levelcnt = { \ NUM_RCU_LVL_0, /* root of hierarchy. 
*/ \ NUM_RCU_LVL_1, \ @@ -69,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ - .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ .name = #structname, \ } -struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); +struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); static struct rcu_state *rcu_state; @@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); -#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ - /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -156,33 +157,41 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) * Note a quiescent state. Because we do not need to know * how many quiescent states passed, just if there was at least * one since the start of the grace period, this just sets a flag. + * The caller must have disabled preemption. 
*/ void rcu_sched_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; } void rcu_bh_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. + * The caller must have disabled preemption. */ void rcu_note_context_switch(int cpu) { + trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); + trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -193,7 +202,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { }; #endif /* #ifdef CONFIG_NO_HZ */ -static int blimit = 10; /* Maximum callbacks per softirq. */ +static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ @@ -314,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp) * trust its state not to change because interrupts are disabled. */ if (cpu_is_offline(rdp->cpu)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); rdp->offline_fqs++; return 1; } @@ -354,19 +364,13 @@ void rcu_enter_nohz(void) local_irq_restore(flags); return; } + trace_rcu_dyntick("Start"); /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ smp_mb__before_atomic_inc(); /* See above. 
*/ atomic_inc(&rdtp->dynticks); smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); local_irq_restore(flags); - - /* If the interrupt queued a callback, get out of dyntick mode. */ - if (in_irq() && - (__get_cpu_var(rcu_sched_data).nxtlist || - __get_cpu_var(rcu_bh_data).nxtlist || - rcu_preempt_needs_cpu(smp_processor_id()))) - set_need_resched(); } /* @@ -391,6 +395,7 @@ void rcu_exit_nohz(void) /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + trace_rcu_dyntick("End"); local_irq_restore(flags); } @@ -481,11 +486,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) */ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) { - unsigned long curr; - unsigned long snap; + unsigned int curr; + unsigned int snap; - curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); - snap = (unsigned long)rdp->dynticks_snap; + curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned int)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -495,7 +500,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { + if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); rdp->dynticks_fqs++; return 1; } @@ -537,6 +543,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) int cpu; long delta; unsigned long flags; + int ndetected; struct rcu_node *rnp = rcu_get_root(rsp); /* Only let one CPU complain about others per time interval. 
*/ @@ -553,7 +560,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * Now rat on any tasks that got kicked up to the root rcu_node * due to CPU offlining. */ - rcu_print_task_stall(rnp); + ndetected = rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); /* @@ -565,17 +572,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) rsp->name); rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave(&rnp->lock, flags); - rcu_print_task_stall(rnp); + ndetected += rcu_print_task_stall(rnp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (rnp->qsmask == 0) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) + if (rnp->qsmask & (1UL << cpu)) { printk(" %d", rnp->grplo + cpu); + ndetected++; + } } printk("} (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); - trigger_all_cpu_backtrace(); + if (ndetected == 0) + printk(KERN_ERR "INFO: Stall ended before state dump start\n"); + else if (!trigger_all_cpu_backtrace()) + dump_stack(); /* If so configured, complain about tasks blocking the grace period. */ @@ -596,7 +608,8 @@ static void print_cpu_stall(struct rcu_state *rsp) */ printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", rsp->name, smp_processor_id(), jiffies - rsp->gp_start); - trigger_all_cpu_backtrace(); + if (!trigger_all_cpu_backtrace()) + dump_stack(); raw_spin_lock_irqsave(&rnp->lock, flags); if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) @@ -678,9 +691,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct * go looking for one. 
*/ rdp->gpnum = rnp->gpnum; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); if (rnp->qsmask & rdp->grpmask) { rdp->qs_pending = 1; - rdp->passed_quiesc = 0; + rdp->passed_quiesce = 0; } else rdp->qs_pending = 0; } @@ -741,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); /* * If we were in an extended quiescent state, we may have @@ -826,31 +841,31 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = this_cpu_ptr(rsp->rda); struct rcu_node *rnp = rcu_get_root(rsp); - if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { - if (cpu_needs_another_gp(rsp, rdp)) - rsp->fqs_need_gp = 1; - if (rnp->completed == rsp->completed) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; - } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + if (!rcu_scheduler_fully_active || + !cpu_needs_another_gp(rsp, rdp)) { + /* + * Either the scheduler hasn't yet spawned the first + * non-idle task or this CPU does not need another + * grace period. Either way, don't start a new grace + * period. + */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rsp->fqs_active) { /* - * Propagate new ->completed value to rcu_node structures - * so that other CPUs don't have to wait until the start - * of the next grace period to process their callbacks. + * This CPU needs a grace period, but force_quiescent_state() + * is running. Tell it to start one on this CPU's behalf. */ - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->completed = rsp->completed; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ - } - local_irq_restore(flags); + rsp->fqs_need_gp = 1; + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } /* Advance to a new grace period and initialize state. */ rsp->gpnum++; + trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; @@ -865,6 +880,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -901,6 +919,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); + trace_rcu_grace_period_init(rsp->name, rnp->gpnum, + rnp->level, rnp->grplo, + rnp->grphi, rnp->qsmask); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } @@ -922,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { unsigned long gp_duration; + struct rcu_node *rnp = rcu_get_root(rsp); + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); @@ -933,7 +956,41 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; - rsp->completed = rsp->gpnum; + + /* + * We know the grace period is complete, but to everyone else + * it appears to still be ongoing. But it is also the case + * that to everyone else it looks like there is nothing that + * they can do to advance the grace period. 
It is therefore + * safe for us to drop the lock in order to mark the grace + * period as completed in all of the rcu_node structures. + * + * But if this CPU needs another grace period, it will take + * care of this while initializing the next grace period. + * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL + * because the callbacks have not yet been advanced: Those + * callbacks are waiting on the grace period that just now + * completed. + */ + if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->gpnum; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + } + + rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ + trace_rcu_grace_period(rsp->name, rsp->completed, "end"); rsp->signaled = RCU_GP_IDLE; rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ } @@ -962,6 +1019,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, return; } rnp->qsmask &= ~mask; + trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, + mask, rnp->qsmask, rnp->level, + rnp->grplo, rnp->grphi, + !!rnp->gp_tasks); if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ @@ -1000,7 +1061,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * based on quiescent states detected in an earlier grace period! 
*/ static void -rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) { unsigned long flags; unsigned long mask; @@ -1008,17 +1069,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); - if (lastcomp != rnp->completed) { + if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { /* - * Someone beat us to it for this grace period, so leave. - * The race with GP start is resolved by the fact that we - * hold the leaf rcu_node lock, so that the per-CPU bits - * cannot yet be initialized -- so we would simply find our - * CPU's bit already cleared in rcu_report_qs_rnp() if this - * race occurred. + * The grace period in which this quiescent state was + * recorded has ended, so don't report it upwards. + * We will instead need a new quiescent state that lies + * within the current grace period. */ - rdp->passed_quiesc = 0; /* try again later! */ + rdp->passed_quiesce = 0; /* need qs for new gp. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } @@ -1062,14 +1121,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (!rdp->passed_quiesc) + if (!rdp->passed_quiesce) return; /* * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); } #ifdef CONFIG_HOTPLUG_CPU @@ -1130,11 +1189,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) raw_spin_unlock(&rnp->lock); /* irqs remain disabled. 
*/ + else + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - + !!(rnp->qsmask & mask), + "cpuofl"); break; } - if (rnp == rdp->mynode) + if (rnp == rdp->mynode) { + trace_rcu_grace_period(rsp->name, + rnp->gpnum + 1 - + !!(rnp->qsmask & mask), + "cpuofl"); need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); - else + } else raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; rnp = rnp->parent; @@ -1190,17 +1258,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; - int count; + int bl, count; /* If no callbacks are ready, just return.*/ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) + if (!cpu_has_callbacks_ready_to_invoke(rdp)) { + trace_rcu_batch_start(rsp->name, 0, 0); + trace_rcu_batch_end(rsp->name, 0); return; + } /* * Extract the list of ready callbacks, disabling to prevent * races with call_rcu() from interrupt handlers. */ local_irq_save(flags); + bl = rdp->blimit; + trace_rcu_batch_start(rsp->name, rdp->qlen, bl); list = rdp->nxtlist; rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; *rdp->nxttail[RCU_DONE_TAIL] = NULL; @@ -1216,13 +1289,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) next = list->next; prefetch(next); debug_rcu_head_unqueue(list); - __rcu_reclaim(list); + __rcu_reclaim(rsp->name, list); list = next; - if (++count >= rdp->blimit) + if (++count >= bl) break; } local_irq_save(flags); + trace_rcu_batch_end(rsp->name, count); /* Update count, and requeue any remaining callbacks. */ rdp->qlen -= count; @@ -1250,7 +1324,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_restore(flags); - /* Re-raise the RCU softirq if there are callbacks remaining. */ + /* Re-invoke RCU core processing if there are callbacks remaining. 
*/ if (cpu_has_callbacks_ready_to_invoke(rdp)) invoke_rcu_core(); } @@ -1258,7 +1332,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* * Check to see if this CPU is in a non-context-switch quiescent state * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). - * Also schedule the RCU softirq handler. + * Also schedule RCU core processing. * * This function must be called with hardirqs disabled. It is normally * invoked from the scheduling-clock interrupt. If rcu_pending returns @@ -1266,6 +1340,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { + trace_rcu_utilization("Start scheduler-tick"); if (user || (idle_cpu(cpu) && rcu_scheduler_active && !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { @@ -1299,6 +1374,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); + trace_rcu_utilization("End scheduler-tick"); } #ifdef CONFIG_SMP @@ -1360,10 +1436,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - if (!rcu_gp_in_progress(rsp)) + trace_rcu_utilization("Start fqs"); + if (!rcu_gp_in_progress(rsp)) { + trace_rcu_utilization("End fqs"); return; /* No grace period in progress, nothing to force. */ + } if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ + trace_rcu_utilization("End fqs"); return; /* Someone else is already on the job. 
*/ } if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) @@ -1412,11 +1492,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ rsp->fqs_need_gp = 0; rcu_start_gp(rsp, flags); /* releases rnp->lock */ + trace_rcu_utilization("End fqs"); return; } raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ unlock_fqs_ret: raw_spin_unlock_irqrestore(&rsp->fqslock, flags); + trace_rcu_utilization("End fqs"); } #else /* #ifdef CONFIG_SMP */ @@ -1429,9 +1511,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) #endif /* #else #ifdef CONFIG_SMP */ /* - * This does the RCU processing work from softirq context for the - * specified rcu_state and rcu_data structures. This may be called - * only from the CPU to whom the rdp belongs. + * This does the RCU core processing work for the specified rcu_state + * and rcu_data structures. This may be called only from the CPU to + * whom the rdp belongs. */ static void __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) @@ -1468,24 +1550,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * Do softirq processing for the current CPU. + * Do RCU core processing for the current CPU. */ static void rcu_process_callbacks(struct softirq_action *unused) { + trace_rcu_utilization("Start RCU core"); __rcu_process_callbacks(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); rcu_preempt_process_callbacks(); - - /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ - rcu_needs_cpu_flush(); + trace_rcu_utilization("End RCU core"); } /* - * Wake up the current CPU's kthread. This replaces raise_softirq() - * in earlier versions of RCU. Note that because we are running on - * the current CPU with interrupts disabled, the rcu_cpu_kthread_task - * cannot disappear out from under us. 
+ * Schedule RCU callback invocation. If the specified type of RCU + * does not support RCU priority boosting, just do a direct call, + * otherwise wake up the per-CPU kernel kthread. Note that because we + * are running on the current CPU with interrupts disabled, the + * rcu_cpu_kthread_task cannot disappear out from under us. */ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) { @@ -1530,6 +1612,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rdp->nxttail[RCU_NEXT_TAIL] = &head->next; rdp->qlen++; + if (__is_kfree_rcu_offset((unsigned long)func)) + trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, + rdp->qlen); + else + trace_rcu_callback(rsp->name, head, rdp->qlen); + /* If interrupts were disabled, don't dive into RCU core. */ if (irqs_disabled_flags(flags)) { local_irq_restore(flags); @@ -1613,18 +1701,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); */ void synchronize_sched(void) { - struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_sched(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu_sched); } EXPORT_SYMBOL_GPL(synchronize_sched); @@ -1639,18 +1718,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - struct rcu_synchronize rcu; - if (rcu_blocking_is_gp()) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu_bh(&rcu.head, wakeme_after_rcu); - /* Wait for it. 
*/ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu_bh); } EXPORT_SYMBOL_GPL(synchronize_rcu_bh); @@ -1671,7 +1741,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending && !rdp->passed_quiesc) { + if (rcu_scheduler_fully_active && + rdp->qs_pending && !rdp->passed_quiesce) { /* * If force_quiescent_state() coming soon and this CPU @@ -1683,7 +1754,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, jiffies)) set_need_resched(); - } else if (rdp->qs_pending && rdp->passed_quiesc) { + } else if (rdp->qs_pending && rdp->passed_quiesce) { rdp->n_rp_report_qs++; return 1; } @@ -1846,6 +1917,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); #endif /* #ifdef CONFIG_NO_HZ */ rdp->cpu = cpu; + rdp->rsp = rsp; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -1865,8 +1937,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); - rdp->passed_quiesc = 0; /* We could be racing with new GP, */ - rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ rdp->preemptible = preemptible; rdp->qlen_last_fqs_check = 0; @@ -1891,9 +1961,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rnp->qsmaskinit |= mask; mask = rnp->grpmask; if (rnp == rdp->mynode) { - rdp->gpnum = rnp->completed; /* if GP in progress... */ + /* + * If there is a grace period in progress, we will + * set up to wait for it next time we run the + * RCU core code. 
+ */ + rdp->gpnum = rnp->completed; rdp->completed = rnp->completed; - rdp->passed_quiesc_completed = rnp->completed - 1; + rdp->passed_quiesce = 0; + rdp->qs_pending = 0; + rdp->passed_quiesce_gpnum = rnp->gpnum - 1; + trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; @@ -1919,6 +1997,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; + trace_rcu_utilization("Start CPU hotplug"); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -1954,6 +2033,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, default: break; } + trace_rcu_utilization("End CPU hotplug"); return NOTIFY_OK; } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 01b2ccda26fb..849ce9ec51fe 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -230,9 +230,9 @@ struct rcu_data { /* in order to detect GP end. */ unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - unsigned long passed_quiesc_completed; - /* Value of completed at time of qs. */ - bool passed_quiesc; /* User-mode/idle loop etc. */ + unsigned long passed_quiesce_gpnum; + /* gpnum at time of quiescent state. */ + bool passed_quiesce; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ bool beenonline; /* CPU online at least once. */ bool preemptible; /* Preemptible RCU? */ @@ -299,6 +299,7 @@ struct rcu_data { unsigned long n_rp_need_nothing; int cpu; + struct rcu_state *rsp; }; /* Values for signaled field in struct rcu_state. 
*/ @@ -417,6 +418,13 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ +#ifdef CONFIG_RCU_BOOST +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DECLARE_PER_CPU(char, rcu_cpu_has_work); +#endif /* #ifdef CONFIG_RCU_BOOST */ + #ifndef RCU_TREE_NONCORE /* Forward declarations for rcutree_plugin.h */ @@ -430,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, static void rcu_stop_cpu_kthread(int cpu); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ static void rcu_print_detail_task_stall(struct rcu_state *rsp); -static void rcu_print_task_stall(struct rcu_node *rnp); +static int rcu_print_task_stall(struct rcu_node *rnp); static void rcu_preempt_stall_reset(void); static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU @@ -450,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_online(void); static void __init __rcu_init_preempt(void); -static void rcu_needs_cpu_flush(void); static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); static void invoke_rcu_callbacks_kthread(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 8aafbb80b8b0..4b9b9f8a4184 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -27,6 +27,14 @@ #include <linux/delay.h> #include <linux/stop_machine.h> +#define RCU_KTHREAD_PRIO 1 + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else +#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO +#endif + /* * Check the RCU kernel configuration parameters and print informative * messages about anything out of the ordinary. 
If you like #ifdef, you @@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); +struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt); DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); static struct rcu_state *rcu_state = &rcu_preempt_state; @@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu) { struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - rdp->passed_quiesc_completed = rdp->gpnum - 1; + rdp->passed_quiesce_gpnum = rdp->gpnum; barrier(); - rdp->passed_quiesc = 1; + if (rdp->passed_quiesce == 0) + trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); + rdp->passed_quiesce = 1; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } @@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu) if (rnp->qsmask & rdp->grpmask) rnp->gp_tasks = &t->rcu_node_entry; } + trace_rcu_preempt_task(rdp->rsp->name, + t->pid, + (rnp->qsmask & rdp->grpmask) + ? rnp->gpnum + : rnp->gpnum + 1); raw_spin_unlock_irqrestore(&rnp->lock, flags); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special) { @@ -299,6 +314,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) int empty_exp; unsigned long flags; struct list_head *np; +#ifdef CONFIG_RCU_BOOST + struct rt_mutex *rbmp = NULL; +#endif /* #ifdef CONFIG_RCU_BOOST */ struct rcu_node *rnp; int special; @@ -344,6 +362,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. 
*/ np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); + t->rcu_blocked_node = NULL; + trace_rcu_unlock_preempted_task("rcu_preempt", + rnp->gpnum, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; if (&t->rcu_node_entry == rnp->exp_tasks) @@ -351,30 +372,34 @@ static noinline void rcu_read_unlock_special(struct task_struct *t) #ifdef CONFIG_RCU_BOOST if (&t->rcu_node_entry == rnp->boost_tasks) rnp->boost_tasks = np; - /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ - if (t->rcu_boosted) { - special |= RCU_READ_UNLOCK_BOOSTED; - t->rcu_boosted = 0; + /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ + if (t->rcu_boost_mutex) { + rbmp = t->rcu_boost_mutex; + t->rcu_boost_mutex = NULL; } #endif /* #ifdef CONFIG_RCU_BOOST */ - t->rcu_blocked_node = NULL; /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ - if (empty) - raw_spin_unlock_irqrestore(&rnp->lock, flags); - else + if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { + trace_rcu_quiescent_state_report("preempt_rcu", + rnp->gpnum, + 0, rnp->qsmask, + rnp->level, + rnp->grplo, + rnp->grphi, + !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rnp, flags); + } else + raw_spin_unlock_irqrestore(&rnp->lock, flags); #ifdef CONFIG_RCU_BOOST /* Unboost if we were boosted. */ - if (special & RCU_READ_UNLOCK_BOOSTED) { - rt_mutex_unlock(t->rcu_boost_mutex); - t->rcu_boost_mutex = NULL; - } + if (rbmp) + rt_mutex_unlock(rbmp); #endif /* #ifdef CONFIG_RCU_BOOST */ /* @@ -399,10 +424,10 @@ void __rcu_read_unlock(void) { struct task_struct *t = current; - barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ if (t->rcu_read_lock_nesting != 1) --t->rcu_read_lock_nesting; else { + barrier(); /* critical section before exit code. 
*/ t->rcu_read_lock_nesting = INT_MIN; barrier(); /* assign before ->rcu_read_unlock_special load */ if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) @@ -466,16 +491,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ -static void rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp) { struct task_struct *t; + int ndetected = 0; if (!rcu_preempt_blocked_readers_cgp(rnp)) - return; + return 0; t = list_entry(rnp->gp_tasks, struct task_struct, rcu_node_entry); - list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { printk(" P%d", t->pid); + ndetected++; + } + return ndetected; } /* @@ -656,18 +685,9 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - struct rcu_synchronize rcu; - if (!rcu_scheduler_active) return; - - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - call_rcu(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + wait_rcu_gp(call_rcu); } EXPORT_SYMBOL_GPL(synchronize_rcu); @@ -968,8 +988,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. 
*/ -static void rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp) { + return 0; } /* @@ -1136,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) #endif /* #else #ifdef CONFIG_RCU_TRACE */ +static struct lock_class_key rcu_boost_class; + /* * Carry out RCU priority boosting on the task indicated by ->exp_tasks * or ->boost_tasks, advancing the pointer to the next task in the @@ -1198,8 +1221,10 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&mtx, t); + /* Avoid lockdep false positives. This rt_mutex is its own thing. */ + lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class, + "rcu_boost_mutex"); t->rcu_boost_mutex = &mtx; - t->rcu_boosted = 1; raw_spin_unlock_irqrestore(&rnp->lock, flags); rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ @@ -1228,9 +1253,12 @@ static int rcu_boost_kthread(void *arg) int spincnt = 0; int more2boost; + trace_rcu_utilization("Start boost kthread@init"); for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + trace_rcu_utilization("End boost kthread@rcu_wait"); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + trace_rcu_utilization("Start boost kthread@rcu_wait"); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1238,11 +1266,14 @@ static int rcu_boost_kthread(void *arg) else spincnt = 0; if (spincnt > 10) { + trace_rcu_utilization("End boost kthread@rcu_yield"); rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + trace_rcu_utilization("Start boost kthread@rcu_yield"); spincnt = 0; } } /* NOTREACHED */ + trace_rcu_utilization("End boost kthread@notreached"); return 0; } @@ -1291,11 +1322,9 @@ static void invoke_rcu_callbacks_kthread(void) local_irq_save(flags); __this_cpu_write(rcu_cpu_has_work, 1); - if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { 
- local_irq_restore(flags); - return; - } - wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && + current != __this_cpu_read(rcu_cpu_kthread_task)) + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); local_irq_restore(flags); } @@ -1343,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (rnp->boost_kthread_task != NULL) return 0; t = kthread_create(rcu_boost_kthread, (void *)rnp, - "rcub%d", rnp_index); + "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); - sp.sched_priority = RCU_KTHREAD_PRIO; + sp.sched_priority = RCU_BOOST_PRIO; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ return 0; @@ -1444,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) { struct sched_param sp; struct timer_list yield_timer; + int prio = current->rt_priority; setup_timer_on_stack(&yield_timer, f, arg); mod_timer(&yield_timer, jiffies + 2); @@ -1451,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg) sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); set_user_nice(current, 19); schedule(); - sp.sched_priority = RCU_KTHREAD_PRIO; + set_user_nice(current, 0); + sp.sched_priority = prio; sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); del_timer(&yield_timer); } @@ -1489,7 +1520,8 @@ static int rcu_cpu_kthread_should_stop(int cpu) /* * Per-CPU kernel thread that invokes RCU callbacks. This replaces the - * earlier RCU softirq. + * RCU softirq used in flavors and configurations of RCU that do not + * support RCU priority boosting. 
*/ static int rcu_cpu_kthread(void *arg) { @@ -1500,9 +1532,12 @@ static int rcu_cpu_kthread(void *arg) char work; char *workp = &per_cpu(rcu_cpu_has_work, cpu); + trace_rcu_utilization("Start CPU kthread@init"); for (;;) { *statusp = RCU_KTHREAD_WAITING; + trace_rcu_utilization("End CPU kthread@rcu_wait"); rcu_wait(*workp != 0 || kthread_should_stop()); + trace_rcu_utilization("Start CPU kthread@rcu_wait"); local_bh_disable(); if (rcu_cpu_kthread_should_stop(cpu)) { local_bh_enable(); @@ -1523,11 +1558,14 @@ static int rcu_cpu_kthread(void *arg) spincnt = 0; if (spincnt > 10) { *statusp = RCU_KTHREAD_YIELDING; + trace_rcu_utilization("End CPU kthread@rcu_yield"); rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + trace_rcu_utilization("Start CPU kthread@rcu_yield"); spincnt = 0; } } *statusp = RCU_KTHREAD_STOPPED; + trace_rcu_utilization("End CPU kthread@term"); return 0; } @@ -1560,7 +1598,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) if (!rcu_scheduler_fully_active || per_cpu(rcu_cpu_kthread_task, cpu) != NULL) return 0; - t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + t = kthread_create_on_node(rcu_cpu_kthread, + (void *)(long)cpu, + cpu_to_node(cpu), + "rcuc/%d", cpu); if (IS_ERR(t)) return PTR_ERR(t); if (cpu_online(cpu)) @@ -1669,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, return 0; if (rnp->node_kthread_task == NULL) { t = kthread_create(rcu_node_kthread, (void *)rnp, - "rcun%d", rnp_index); + "rcun/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); @@ -1907,15 +1948,6 @@ int rcu_needs_cpu(int cpu) return rcu_needs_cpu_quick_check(cpu); } -/* - * Check to see if we need to continue a callback-flush operations to - * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle - * entry is not configured, so we never do need to. 
- */ -static void rcu_needs_cpu_flush(void) -{ -} - #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ #define RCU_NEEDS_CPU_FLUSHES 5 @@ -1991,20 +2023,4 @@ int rcu_needs_cpu(int cpu) return c; } -/* - * Check to see if we need to continue a callback-flush operations to - * allow the last CPU to enter dyntick-idle mode. - */ -static void rcu_needs_cpu_flush(void) -{ - int cpu = smp_processor_id(); - unsigned long flags; - - if (per_cpu(rcu_dyntick_drain, cpu) <= 0) - return; - local_irq_save(flags); - (void)rcu_needs_cpu(cpu); - local_irq_restore(flags); -} - #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 3b0c0986afc0..9feffa4c0695 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -48,11 +48,6 @@ #ifdef CONFIG_RCU_BOOST -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); -DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); -DECLARE_PER_CPU(char, rcu_cpu_has_work); - static char convert_kthread_status(unsigned int kthread_status) { if (kthread_status > RCU_KTHREAD_MAX) @@ -66,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, - rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, " dt=%d/%d/%d df=%lu", @@ -144,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) rdp->cpu, cpu_is_offline(rdp->cpu) ?
"\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, - rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->passed_quiesce, rdp->passed_quiesce_gpnum, rdp->qs_pending); #ifdef CONFIG_NO_HZ seq_printf(m, ",%d,%d,%d,%lu", @@ -175,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) static int show_rcudata_csv(struct seq_file *m, void *unused) { - seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); #ifdef CONFIG_NO_HZ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); #endif /* #ifdef CONFIG_NO_HZ */ diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c index 3c7cbc2c33be..a2e7e7210f3e 100644 --- a/kernel/rtmutex-debug.c +++ b/kernel/rtmutex-debug.c @@ -29,61 +29,6 @@ #include "rtmutex_common.h" -# define TRACE_WARN_ON(x) WARN_ON(x) -# define TRACE_BUG_ON(x) BUG_ON(x) - -# define TRACE_OFF() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - if (raw_spin_is_locked(&current->pi_lock)) \ - raw_spin_unlock(&current->pi_lock); \ - } \ -} while (0) - -# define TRACE_OFF_NOLOCK() \ -do { \ - if (rt_trace_on) { \ - rt_trace_on = 0; \ - console_verbose(); \ - } \ -} while (0) - -# define TRACE_BUG_LOCKED() \ -do { \ - TRACE_OFF(); \ - BUG(); \ -} while (0) - -# define TRACE_WARN_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) { \ - TRACE_OFF(); \ - WARN_ON(1); \ - } \ -} while (0) - -# define TRACE_BUG_ON_LOCKED(c) \ -do { \ - if (unlikely(c)) \ - TRACE_BUG_LOCKED(); \ -} while (0) - -#ifdef CONFIG_SMP -# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) -#else -# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) -#endif - -/* - * deadlock detection flag.
We turn it off when we detect - * the first problem because we dont want to recurse back - * into the tracing code when doing error printk or - * executing a BUG(): - */ -static int rt_trace_on = 1; - static void printk_task(struct task_struct *p) { if (p) @@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner) void rt_mutex_debug_task_free(struct task_struct *task) { - WARN_ON(!plist_head_empty(&task->pi_waiters)); - WARN_ON(task->pi_blocked_on); + DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); + DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } /* @@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, { struct task_struct *task; - if (!rt_trace_on || detect || !act_waiter) + if (!debug_locks || detect || !act_waiter) return; task = rt_mutex_owner(act_waiter->lock); @@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) { struct task_struct *task; - if (!waiter->deadlock_lock || !rt_trace_on) + if (!waiter->deadlock_lock || !debug_locks) return; rcu_read_lock(); @@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - TRACE_OFF_NOLOCK(); + if (!debug_locks_off()) { + rcu_read_unlock(); + return; + } printk("\n============================================\n"); printk( "[ BUG: circular locking deadlock detected! ]\n"); @@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) printk("[ turning off deadlock detection." "Please report this trace. 
]\n\n"); - local_irq_disable(); } void debug_rt_mutex_lock(struct rt_mutex *lock) @@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock) void debug_rt_mutex_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); + DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current); } void @@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) { - TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); + DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock)); } void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) @@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { put_pid(waiter->deadlock_task_pid); - TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); - TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); + DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); memset(waiter, 0x22, sizeof(*waiter)); } diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 255e1662acdb..5e8d9cce7470 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct rt_mutex_waiter *waiter) { int ret = 0; + int was_disabled; for (;;) { /* Try to acquire the lock: */ @@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, raw_spin_unlock(&lock->wait_lock); + was_disabled = irqs_disabled(); + if (was_disabled) + local_irq_enable(); + debug_rt_mutex_print_deadlock(waiter); schedule_rt_mutex(lock); + if (was_disabled) + local_irq_disable(); + raw_spin_lock(&lock->wait_lock); set_current_state(state); } diff --git a/kernel/sched.c b/kernel/sched.c index 24637c782002..d87c6e5d4e8c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1895,7 +1895,7 @@ static inline void __set_task_cpu(struct task_struct *p, 
unsigned int cpu) #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be - * successfuly executed on another CPU. We must ensure that updates of + * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); @@ -4346,6 +4346,7 @@ static inline void schedule_debug(struct task_struct *prev) */ if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) __schedule_bug(prev); + rcu_sleep_check(); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -6101,15 +6102,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) } /* - * In a system that switches off the HZ timer nohz_cpu_mask - * indicates which cpus entered this state. This is used - * in the rcu update to wait only for active cpus. For system - * which do not switch off the HZ timer nohz_cpu_mask should - * always be CPU_BITS_NONE. - */ -cpumask_var_t nohz_cpu_mask; - -/* * Increase the granularity value when there are more CPUs, * because with more CPUs the 'effective latency' as visible * to users decreases. But the relationship is not linear, @@ -8348,8 +8340,6 @@ void __init sched_init(void) */ current->sched_class = &fair_sched_class; - /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ - zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); #ifdef CONFIG_SMP zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); #ifdef CONFIG_NO_HZ @@ -8379,6 +8369,7 @@ void __might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ + rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || system_state != SYSTEM_RUNNING || oops_in_progress) return; diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 331e01bcd026..87f9e36ea56e 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.utime = cputime_add(cputimer->cputime.utime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.stime = cputime_add(cputimer->cputime.stime, cputime); - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } /** @@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk, if (!cputimer->running) return; - spin_lock(&cputimer->lock); + raw_spin_lock(&cputimer->lock); cputimer->cputime.sum_exec_runtime += ns; - spin_unlock(&cputimer->lock); + raw_spin_unlock(&cputimer->lock); } diff --git a/kernel/semaphore.c b/kernel/semaphore.c index 94a62c0d4ade..d831841e55a7 100644 --- a/kernel/semaphore.c +++ b/kernel/semaphore.c @@ -54,12 +54,12 @@ void down(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else __down(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(down); @@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_interruptible(sem); - 
spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_killable(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem) unsigned long flags; int count; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); count = sem->count - 1; if (likely(count >= 0)) sem->count = count; - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return (count < 0); } @@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies) unsigned long flags; int result = 0; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(sem->count > 0)) sem->count--; else result = __down_timeout(sem, jiffies); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); return result; } @@ -179,12 +179,12 @@ void up(struct semaphore *sem) { unsigned long flags; - spin_lock_irqsave(&sem->lock, flags); + raw_spin_lock_irqsave(&sem->lock, flags); if (likely(list_empty(&sem->wait_list))) sem->count++; else __up(sem); - spin_unlock_irqrestore(&sem->lock, flags); + raw_spin_unlock_irqrestore(&sem->lock, flags); } EXPORT_SYMBOL(up); @@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state, if (timeout <= 0) goto timed_out; __set_task_state(task, state); - spin_unlock_irq(&sem->lock); + raw_spin_unlock_irq(&sem->lock); timeout = schedule_timeout(timeout); - spin_lock_irq(&sem->lock); + raw_spin_lock_irq(&sem->lock); if (waiter.up) return 0; } diff --git a/kernel/signal.c b/kernel/signal.c index 
291c9700be75..d252be2d3de5 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } +static int kill_as_cred_perm(const struct cred *cred, + struct task_struct *target) +{ + const struct cred *pcred = __task_cred(target); + if (cred->user_ns != pcred->user_ns) + return 0; + if (cred->euid != pcred->suid && cred->euid != pcred->uid && + cred->uid != pcred->suid && cred->uid != pcred->uid) + return 0; + return 1; +} + /* like kill_pid_info(), but doesn't use uid/euid of "current" */ -int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, - uid_t uid, uid_t euid, u32 secid) +int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid, + const struct cred *cred, u32 secid) { int ret = -EINVAL; struct task_struct *p; - const struct cred *pcred; unsigned long flags; if (!valid_signal(sig)) @@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, ret = -ESRCH; goto out_unlock; } - pcred = __task_cred(p); - if (si_fromuser(info) && - euid != pcred->suid && euid != pcred->uid && - uid != pcred->suid && uid != pcred->uid) { + if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { ret = -EPERM; goto out_unlock; } @@ -1384,7 +1392,7 @@ out_unlock: rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); /* * kill_something_info() interprets pid in interesting ways just like kill(2). 
diff --git a/kernel/sys.c b/kernel/sys.c index 18ee1d2f6474..58459509b14c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1172,7 +1172,7 @@ DECLARE_RWSEM(uts_sem); static int override_release(char __user *release, int len) { int ret = 0; - char buf[len]; + char buf[65]; if (current->personality & UNAME26) { char *rest = UTS_RELEASE; @@ -1759,6 +1759,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, sizeof(me->comm) - 1) < 0) return -EFAULT; set_task_comm(me, comm); + proc_comm_connector(me); return 0; case PR_GET_NAME: get_task_comm(comm, me); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index e8bffbe2ba4b..6318b511afa1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = { { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, - { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, + /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */ { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index d5097c44b407..eb98e55196b9 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now) struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - cpumask_clear_cpu(cpu, nohz_cpu_mask); ts->idle_waketime = now; local_irq_save(flags); @@ -389,9 +388,6 @@ void tick_nohz_stop_sched_tick(int inidle) else expires.tv64 = KTIME_MAX; - if (delta_jiffies > 1) - cpumask_set_cpu(cpu, nohz_cpu_mask); - /* Skip reprogram of event if its not changed */ if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 
goto out; @@ -441,7 +437,6 @@ void tick_nohz_stop_sched_tick(int inidle) * softirq. */ tick_do_update_jiffies64(ktime_get()); - cpumask_clear_cpu(cpu, nohz_cpu_mask); } raise_softirq_irqoff(TIMER_SOFTIRQ); out: @@ -524,7 +519,6 @@ void tick_nohz_restart_sched_tick(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); - cpumask_clear_cpu(cpu, nohz_cpu_mask); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index a5d0a3a85dd8..0b537f27b559 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -81,7 +81,7 @@ struct entry { /* * Spinlock protecting the tables - not taken during lookup: */ -static DEFINE_SPINLOCK(table_lock); +static DEFINE_RAW_SPINLOCK(table_lock); /* * Per-CPU lookup locks for fast hash lookup: @@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) prev = NULL; curr = *head; - spin_lock(&table_lock); + raw_spin_lock(&table_lock); /* * Make sure we have not raced with another CPU: */ @@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) *head = curr; } out_unlock: - spin_unlock(&table_lock); + raw_spin_unlock(&table_lock); return curr; } diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 761c510a06c5..5f39a07fe5ea 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +CFLAGS_trace_events_filter.o := -I$(src) + # # Make the trace clocks available generally: it's infrastructure # relied on by ptrace for example: @@ -53,6 +55,9 @@ endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_PM_RUNTIME),y) +obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o +endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif diff --git a/kernel/trace/ftrace.c 
b/kernel/trace/ftrace.c index c3e4575e7829..077d85387908 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3863,6 +3863,14 @@ void ftrace_kill(void) } /** + * Test if ftrace is dead or not. + */ +int ftrace_is_dead(void) +{ + return ftrace_disabled; +} + +/** * register_ftrace_function - register a function for profiling * @ops - ops structure that holds the function for profiling. * diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 731201bf4acc..f5b7b5c1195b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -478,7 +478,7 @@ struct ring_buffer_per_cpu { int cpu; atomic_t record_disabled; struct ring_buffer *buffer; - spinlock_t reader_lock; /* serialize readers */ + raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; struct list_head *pages; @@ -488,12 +488,14 @@ struct ring_buffer_per_cpu { struct buffer_page *reader_page; unsigned long lost_events; unsigned long last_overrun; + local_t entries_bytes; local_t commit_overrun; local_t overrun; local_t entries; local_t committing; local_t commits; unsigned long read; + unsigned long read_bytes; u64 write_stamp; u64 read_stamp; }; @@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) cpu_buffer->cpu = cpu; cpu_buffer->buffer = buffer; - spin_lock_init(&cpu_buffer->reader_lock); + raw_spin_lock_init(&cpu_buffer->reader_lock); lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; @@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) rb_check_pages(cpu_buffer); out: - 
spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } static void @@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, struct list_head *p; unsigned i; - spin_lock_irq(&cpu_buffer->reader_lock); + raw_spin_lock_irq(&cpu_buffer->reader_lock); rb_head_page_deactivate(cpu_buffer); for (i = 0; i < nr_pages; i++) { @@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, rb_check_pages(cpu_buffer); out: - spin_unlock_irq(&cpu_buffer->reader_lock); + raw_spin_unlock_irq(&cpu_buffer->reader_lock); } /** @@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the counters. */ local_add(entries, &cpu_buffer->overrun); + local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes); /* * The entries will be zeroed out when we move the @@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); kmemcheck_annotate_bitfield(event, bitfield); + /* account for padding bytes */ + local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); + /* * Save the original length to the meta data. * This will be used by the reader to add lost event @@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (!tail) tail_page->page->time_stamp = ts; + /* account for these added bytes */ + local_add(length, &cpu_buffer->entries_bytes); + return event; } @@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = local_read(&bpage->write) & ~RB_WRITE_MASK; + unsigned long event_length = rb_event_length(event); /* * This is on the tail page. 
It is possible that * a write could come in and move the tail page @@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, old_index += write_mask; new_index += write_mask; index = local_cmpxchg(&bpage->write, old_index, new_index); - if (index == old_index) + if (index == old_index) { + /* update counters */ + local_sub(event_length, &cpu_buffer->entries_bytes); return 1; + } } /* could not discard */ @@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) } /** + * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. + */ +unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu) +{ + unsigned long flags; + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + /* + * if the tail is on reader_page, oldest time stamp is on the reader + * page + */ + if (cpu_buffer->tail_page == cpu_buffer->reader_page) + bpage = cpu_buffer->reader_page; + else + bpage = rb_set_head_page(cpu_buffer); + ret = bpage->page->time_stamp; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts); + +/** + * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to read from. 
+ */ +unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu); + +/** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer * @cpu: The per CPU buffer to get the entries from. @@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); rb_iter_reset(iter); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); @@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts, again: local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event && event->type_len == RINGBUF_TYPE_PADDING) rb_advance_reader(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) @@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) unsigned long flags; again: - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); event = rb_iter_peek(iter, ts); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (event && event->type_len == RINGBUF_TYPE_PADDING) goto again; @@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, 
u64 *ts, cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); event = rb_buffer_peek(cpu_buffer, ts, lost_events); if (event) { @@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts, } if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); out: @@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter) cpu_buffer = iter->cpu_buffer; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); arch_spin_lock(&cpu_buffer->lock); rb_iter_reset(iter); arch_spin_unlock(&cpu_buffer->lock); - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); } EXPORT_SYMBOL_GPL(ring_buffer_read_start); @@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; unsigned long flags; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); again: event = rb_iter_peek(iter, ts); if (!event) @@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) rb_advance_iter(iter); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); return event; } @@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->read = 0; local_set(&cpu_buffer->commit_overrun, 0); + local_set(&cpu_buffer->entries_bytes, 0); local_set(&cpu_buffer->overrun, 0); local_set(&cpu_buffer->entries, 0); local_set(&cpu_buffer->committing, 0); local_set(&cpu_buffer->commits, 0); cpu_buffer->read = 0; + cpu_buffer->read_bytes = 0; cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -3557,7 +3624,7 @@ void 
ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) atomic_inc(&cpu_buffer->record_disabled); - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) goto out; @@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) arch_spin_unlock(&cpu_buffer->lock); out: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); atomic_dec(&cpu_buffer->record_disabled); } @@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); if (!ret) @@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu) cpu_buffer = buffer->buffers[cpu]; local_irq_save(flags); if (dolock) - spin_lock(&cpu_buffer->reader_lock); + raw_spin_lock(&cpu_buffer->reader_lock); ret = rb_per_cpu_empty(cpu_buffer); if (dolock) - spin_unlock(&cpu_buffer->reader_lock); + raw_spin_unlock(&cpu_buffer->reader_lock); local_irq_restore(flags); return ret; @@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, if (!bpage) goto out; - spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); reader = rb_get_reader_page(cpu_buffer); if (!reader) @@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, } else { /* update the entry counter */ cpu_buffer->read += rb_page_entries(reader); + cpu_buffer->read_bytes += BUF_PAGE_SIZE; /* swap the pages */ rb_init_page(bpage); @@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, 
memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); out_unlock: - spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); out: return ret; diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c new file mode 100644 index 000000000000..4b3b5eaf94d1 --- /dev/null +++ b/kernel/trace/rpm-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com> + */ + +#include <linux/string.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <linux/sched.h> +#include <linux/module.h> +#include <linux/usb.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/rpm.h> + +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend); +EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e5df02c69b1d..f2bd275bb60f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; static int trace_stop_count; -static DEFINE_SPINLOCK(tracing_start_lock); +static DEFINE_RAW_SPINLOCK(tracing_start_lock); static void wakeup_work_handler(struct work_struct *work) { @@ -435,6 +435,7 @@ static struct { } trace_clocks[] = { { trace_clock_local, "local" }, { trace_clock_global, "global" }, + { trace_clock_counter, "counter" }, }; int trace_clock_id; @@ -960,7 +961,7 @@ void tracing_start(void) if (tracing_disabled) return; - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (--trace_stop_count) { if (trace_stop_count < 0) { /* Someone screwed up their debugging */ @@ -985,7 +986,7 @@ void tracing_start(void) ftrace_start(); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + 
raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } /** @@ -1000,7 +1001,7 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - spin_lock_irqsave(&tracing_start_lock, flags); + raw_spin_lock_irqsave(&tracing_start_lock, flags); if (trace_stop_count++) goto out; @@ -1018,7 +1019,7 @@ void tracing_stop(void) arch_spin_unlock(&ftrace_max_lock); out: - spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&tracing_start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -2159,6 +2160,14 @@ void trace_default_header(struct seq_file *m) } } +static void test_ftrace_alive(struct seq_file *m) +{ + if (!ftrace_is_dead()) + return; + seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); +} + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2168,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v) if (iter->tr) { seq_printf(m, "# tracer: %s\n", iter->trace->name); seq_puts(m, "#\n"); + test_ftrace_alive(m); } if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); @@ -2710,9 +2720,9 @@ static const char readme_msg[] = "# cat /sys/kernel/debug/tracing/trace_options\n" "noprint-parent nosym-offset nosym-addr noverbose\n" "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" ; static ssize_t @@ -3569,6 +3579,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, } static ssize_t +tracing_total_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + int r, cpu; + unsigned long size = 0, expanded_size = 0; 
+ + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + size += tr->entries >> 10; + if (!ring_buffer_expanded) + expanded_size += trace_buf_size >> 10; + } + if (ring_buffer_expanded) + r = sprintf(buf, "%lu\n", size); + else + r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t tracing_free_buffer_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -3594,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp) return 0; } -static int mark_printk(const char *fmt, ...) -{ - int ret; - va_list args; - va_start(args, fmt); - ret = trace_vprintk(0, fmt, args); - va_end(args); - return ret; -} - static ssize_t tracing_mark_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { - char *buf; - size_t written; + unsigned long addr = (unsigned long)ubuf; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + struct page *pages[2]; + int nr_pages = 1; + ssize_t written; + void *page1; + void *page2; + int offset; + int size; + int len; + int ret; if (tracing_disabled) return -EINVAL; @@ -3617,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, if (cnt > TRACE_BUF_SIZE) cnt = TRACE_BUF_SIZE; - buf = kmalloc(cnt + 2, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; + /* + * Userspace is injecting traces into the kernel trace buffer. + * We want to be as non intrusive as possible. + * To do so, we do not want to allocate any special buffers + * or take any locks, but instead write the userspace data + * straight into the ring buffer. + * + * First we need to pin the userspace buffer into memory, + * which, most likely it is, because it just referenced it. + * But there's no guarantee that it is. 
By using get_user_pages_fast() + * and kmap_atomic/kunmap_atomic() we can get access to the + * pages directly. We then write the data directly into the + * ring buffer. + */ + BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE); - if (copy_from_user(buf, ubuf, cnt)) { - kfree(buf); - return -EFAULT; + /* check if we cross pages */ + if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK)) + nr_pages = 2; + + offset = addr & (PAGE_SIZE - 1); + addr &= PAGE_MASK; + + ret = get_user_pages_fast(addr, nr_pages, 0, pages); + if (ret < nr_pages) { + while (--ret >= 0) + put_page(pages[ret]); + written = -EFAULT; + goto out; + } + + page1 = kmap_atomic(pages[0]); + if (nr_pages == 2) + page2 = kmap_atomic(pages[1]); + + local_save_flags(irq_flags); + size = sizeof(*entry) + cnt + 2; /* possible \n added */ + buffer = global_trace.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, preempt_count()); + if (!event) { + /* Ring buffer disabled, return as if not open for write */ + written = -EBADF; + goto out_unlock; } - if (buf[cnt-1] != '\n') { - buf[cnt] = '\n'; - buf[cnt+1] = '\0'; + + entry = ring_buffer_event_data(event); + entry->ip = _THIS_IP_; + + if (nr_pages == 2) { + len = PAGE_SIZE - offset; + memcpy(&entry->buf, page1 + offset, len); + memcpy(&entry->buf[len], page2, cnt - len); } else - buf[cnt] = '\0'; + memcpy(&entry->buf, page1 + offset, cnt); - written = mark_printk("%s", buf); - kfree(buf); - *fpos += written; + if (entry->buf[cnt - 1] != '\n') { + entry->buf[cnt] = '\n'; + entry->buf[cnt + 1] = '\0'; + } else + entry->buf[cnt] = '\0'; + + ring_buffer_unlock_commit(buffer, event); - /* don't tell userspace we wrote more - it might confuse them */ - if (written > cnt) - written = cnt; + written = cnt; + *fpos += written; + + out_unlock: + if (nr_pages == 2) + kunmap_atomic(page2); + kunmap_atomic(page1); + while (nr_pages > 0) + put_page(pages[--nr_pages]); + out: return written; } @@ -3739,6 +3828,12 @@ static const struct 
file_operations tracing_entries_fops = { .llseek = generic_file_llseek, }; +static const struct file_operations tracing_total_entries_fops = { + .open = tracing_open_generic, + .read = tracing_total_entries_read, + .llseek = generic_file_llseek, +}; + static const struct file_operations tracing_free_buffer_fops = { .write = tracing_free_buffer_write, .release = tracing_free_buffer_release, @@ -3808,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (info->read < PAGE_SIZE) goto read; - info->read = 0; - trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, @@ -3819,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) return 0; + info->read = 0; + read: size = PAGE_SIZE - info->read; if (size > count) @@ -4026,6 +4121,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf, struct trace_array *tr = &global_trace; struct trace_seq *s; unsigned long cnt; + unsigned long long t; + unsigned long usec_rem; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4042,6 +4139,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf, cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); + cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + trace_seq_printf(s, "bytes: %ld\n", cnt); + + t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); + + t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + usec_rem = do_div(t, USEC_PER_SEC); + trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); kfree(s); @@ -4450,6 +4558,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("buffer_size_kb", 0644, d_tracer, &global_trace, &tracing_entries_fops); + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + &global_trace, 
&tracing_total_entries_fops); + trace_create_file("free_buffer", 0644, d_tracer, &global_trace, &tracing_free_buffer_fops); @@ -4566,6 +4677,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) tracing_off(); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + if (disable_tracing) ftrace_kill(); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 616846bcfee5..092e1f8d18dc 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task) return test_tsk_trace_trace(task); } +extern int ftrace_is_dead(void); #else static inline int ftrace_trace_task(struct task_struct *task) { return 1; } +static inline int ftrace_is_dead(void) { return 0; } #endif /* @@ -761,16 +763,10 @@ struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; - /* - * Leaf nodes use field_name, ops is used by AND and OR - * nodes. The field_name is always freed when freeing a pred. - * We can overload field_name for ops and have it freed - * as well. - */ - union { - char *field_name; - unsigned short *ops; - }; + unsigned short *ops; +#ifdef CONFIG_FTRACE_STARTUP_TEST + struct ftrace_event_field *field; +#endif int offset; int not; int op; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 6302747a1398..394783531cbb 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void) return now; } + +static atomic64_t trace_counter; + +/* + * trace_clock_counter(): simply an atomic counter. + * Use the trace_counter "counter" for cases where you do not care + * about timings, but are interested in strict ordering. 
+ */ +u64 notrace trace_clock_counter(void) +{ + return atomic64_add_return(1, &trace_counter); +} diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 256764ecccd6..816d3d074979 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, return pred; } +enum walk_return { + WALK_PRED_ABORT, + WALK_PRED_PARENT, + WALK_PRED_DEFAULT, +}; + +typedef int (*filter_pred_walkcb_t) (enum move_type move, + struct filter_pred *pred, + int *err, void *data); + +static int walk_pred_tree(struct filter_pred *preds, + struct filter_pred *root, + filter_pred_walkcb_t cb, void *data) +{ + struct filter_pred *pred = root; + enum move_type move = MOVE_DOWN; + int done = 0; + + if (!preds) + return -EINVAL; + + do { + int err = 0, ret; + + ret = cb(move, pred, &err, data); + if (ret == WALK_PRED_ABORT) + return err; + if (ret == WALK_PRED_PARENT) + goto get_parent; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + goto get_parent; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + get_parent: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, + &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. */ + return 0; +} + /* * A series of AND or ORs where found together. 
Instead of * climbing up and down the tree branches, an array of the @@ -410,99 +467,91 @@ static int process_ops(struct filter_pred *preds, for (i = 0; i < op->val; i++) { pred = &preds[op->ops[i]]; - match = pred->fn(pred, rec); + if (!WARN_ON_ONCE(!pred->fn)) + match = pred->fn(pred, rec); if (!!match == type) return match; } return match; } +struct filter_match_preds_data { + struct filter_pred *preds; + int match; + void *rec; +}; + +static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_match_preds_data *d = data; + + *err = 0; + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. */ + if (!pred->ops) + return WALK_PRED_DEFAULT; + /* We can treat folded ops as a leaf node */ + d->match = process_ops(d->preds, pred, d->rec); + } else { + if (!WARN_ON_ONCE(!pred->fn)) + d->match = pred->fn(pred, d->rec); + } + + return WALK_PRED_PARENT; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!d->match == (pred->op == OP_OR)) + return WALK_PRED_PARENT; + break; + case MOVE_UP_FROM_RIGHT: + break; + } + + return WALK_PRED_DEFAULT; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { - int match = -1; - enum move_type move = MOVE_DOWN; struct filter_pred *preds; - struct filter_pred *pred; struct filter_pred *root; - int n_preds; - int done = 0; + struct filter_match_preds_data data = { + /* match is currently meaningless */ + .match = -1, + .rec = rec, + }; + int n_preds, ret; /* no filter is considered a match */ if (!filter) return 1; n_preds = filter->n_preds; - if (!n_preds) return 1; /* * n_preds, root and filter->preds are protect with preemption disabled. 
*/ - preds = rcu_dereference_sched(filter->preds); root = rcu_dereference_sched(filter->root); if (!root) return 1; - pred = root; - - /* match is currently meaningless */ - match = -1; - - do { - switch (move) { - case MOVE_DOWN: - /* only AND and OR have children */ - if (pred->left != FILTER_PRED_INVALID) { - /* If ops is set, then it was folded. */ - if (!pred->ops) { - /* keep going to down the left side */ - pred = &preds[pred->left]; - continue; - } - /* We can treat folded ops as a leaf node */ - match = process_ops(preds, pred, rec); - } else - match = pred->fn(pred, rec); - /* If this pred is the only pred */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - /* - * Check for short circuits. - * - * Optimization: !!match == (pred->op == OP_OR) - * is the same as: - * if ((match && pred->op == OP_OR) || - * (!match && pred->op == OP_AND)) - */ - if (!!match == (pred->op == OP_OR)) { - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - /* now go down the right side of the tree. */ - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - /* We finished this equation. 
*/ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return match; + data.preds = preds = rcu_dereference_sched(filter->preds); + ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data); + WARN_ON(ret); + return data.match; } EXPORT_SYMBOL_GPL(filter_match_preds); @@ -628,22 +677,6 @@ find_event_field(struct ftrace_event_call *call, char *name) return __find_event_field(head, name); } -static void filter_free_pred(struct filter_pred *pred) -{ - if (!pred) - return; - - kfree(pred->field_name); - kfree(pred); -} - -static void filter_clear_pred(struct filter_pred *pred) -{ - kfree(pred->field_name); - pred->field_name = NULL; - pred->regex.len = 0; -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); @@ -689,20 +722,13 @@ __pop_pred_stack(struct pred_stack *stack) static int filter_set_pred(struct event_filter *filter, int idx, struct pred_stack *stack, - struct filter_pred *src, - filter_pred_fn_t fn) + struct filter_pred *src) { struct filter_pred *dest = &filter->preds[idx]; struct filter_pred *left; struct filter_pred *right; *dest = *src; - if (src->field_name) { - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; - } - dest->fn = fn; dest->index = idx; if (dest->op == OP_OR || dest->op == OP_AND) { @@ -743,11 +769,7 @@ static int filter_set_pred(struct event_filter *filter, static void __free_preds(struct event_filter *filter) { - int i; - if (filter->preds) { - for (i = 0; i < filter->a_preds; i++) - kfree(filter->preds[i].field_name); kfree(filter->preds); filter->preds = NULL; } @@ -840,23 +862,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system) } } -static int filter_add_pred_fn(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct 
filter_pred *pred, - struct pred_stack *stack, - filter_pred_fn_t fn) +static int filter_add_pred(struct filter_parse_state *ps, + struct event_filter *filter, + struct filter_pred *pred, + struct pred_stack *stack) { - int idx, err; + int err; if (WARN_ON(filter->n_preds == filter->a_preds)) { parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } - idx = filter->n_preds; - filter_clear_pred(&filter->preds[idx]); - err = filter_set_pred(filter, idx, stack, pred, fn); + err = filter_set_pred(filter, filter->n_preds, stack, pred); if (err) return err; @@ -937,31 +955,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size, return fn; } -static int filter_add_pred(struct filter_parse_state *ps, - struct ftrace_event_call *call, - struct event_filter *filter, - struct filter_pred *pred, - struct pred_stack *stack, - bool dry_run) +static int init_pred(struct filter_parse_state *ps, + struct ftrace_event_field *field, + struct filter_pred *pred) + { - struct ftrace_event_field *field; - filter_pred_fn_t fn; + filter_pred_fn_t fn = filter_pred_none; unsigned long long val; int ret; - fn = pred->fn = filter_pred_none; - - if (pred->op == OP_AND) - goto add_pred_fn; - else if (pred->op == OP_OR) - goto add_pred_fn; - - field = find_event_field(call, pred->field_name); - if (!field) { - parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); - return -EINVAL; - } - pred->offset = field->offset; if (!is_legal_op(field, pred->op)) { @@ -1001,9 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps, if (pred->op == OP_NE) pred->not = 1; -add_pred_fn: - if (!dry_run) - return filter_add_pred_fn(ps, call, filter, pred, stack, fn); + pred->fn = fn; return 0; } @@ -1302,39 +1302,37 @@ parse_operand: return 0; } -static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +static struct filter_pred *create_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + int op, char *operand1, char *operand2) { - struct 
filter_pred *pred; + struct ftrace_event_field *field; + static struct filter_pred pred; - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) - return NULL; + memset(&pred, 0, sizeof(pred)); + pred.op = op; - pred->field_name = kstrdup(operand1, GFP_KERNEL); - if (!pred->field_name) { - kfree(pred); + if (op == OP_AND || op == OP_OR) + return &pred; + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); return NULL; } - strcpy(pred->regex.pattern, operand2); - pred->regex.len = strlen(pred->regex.pattern); - - pred->op = op; - - return pred; -} - -static struct filter_pred *create_logical_pred(int op) -{ - struct filter_pred *pred; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + field = find_event_field(call, operand1); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; + } - pred->op = op; + strcpy(pred.regex.pattern, operand2); + pred.regex.len = strlen(pred.regex.pattern); - return pred; +#ifdef CONFIG_FTRACE_STARTUP_TEST + pred.field = field; +#endif + return init_pred(ps, field, &pred) ? NULL : &pred; } static int check_preds(struct filter_parse_state *ps) @@ -1375,6 +1373,23 @@ static int count_preds(struct filter_parse_state *ps) return n_preds; } +struct check_pred_data { + int count; + int max; +}; + +static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct check_pred_data *d = data; + + if (WARN_ON(d->count++ > d->max)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + return WALK_PRED_DEFAULT; +} + /* * The tree is walked at filtering of an event. If the tree is not correctly * built, it may cause an infinite loop. 
Check here that the tree does @@ -1383,107 +1398,76 @@ static int count_preds(struct filter_parse_state *ps) static int check_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; - int max; - - /* - * The max that we can hit a node is three times. - * Once going down, once coming up from left, and - * once coming up from right. This is more than enough - * since leafs are only hit a single time. - */ - max = 3 * filter->n_preds; + struct check_pred_data data = { + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + .max = 3 * filter->n_preds, + .count = 0, + }; - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; + return walk_pred_tree(filter->preds, root, + check_pred_tree_cb, &data); +} - do { - if (WARN_ON(count++ > max)) - return -EINVAL; +static int count_leafs_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + int *count = data; - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) + (*count)++; - /* We are fine. 
*/ - return 0; + return WALK_PRED_DEFAULT; } static int count_leafs(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; - int done = 0; + int count = 0, ret; - pred = root; + ret = walk_pred_tree(preds, root, count_leafs_cb, &count); + WARN_ON(ret); + return count; +} - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - return 1; - count++; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); +struct fold_pred_data { + struct filter_pred *root; + int count; + int children; +}; - return count; +static int fold_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct fold_pred_data *d = data; + struct filter_pred *root = d->root; + + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (pred->left != FILTER_PRED_INVALID) + return WALK_PRED_DEFAULT; + + if (WARN_ON(d->count == d->children)) { + *err = -EINVAL; + return WALK_PRED_ABORT; + } + + pred->index &= ~FILTER_PRED_FOLD; + root->ops[d->count++] = pred->index; + return WALK_PRED_DEFAULT; } static int fold_pred(struct filter_pred *preds, struct filter_pred *root) { - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int count = 0; + struct fold_pred_data data = { + .root = root, + .count = 0, + }; int children; - int done = 0; /* No need to keep the fold flag */ root->index &= ~FILTER_PRED_FOLD; @@ -1501,37 +1485,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) return -ENOMEM; root->val = children; + data.children = children; + 
return walk_pred_tree(preds, root, fold_pred_cb, &data); +} - pred = root; - do { - switch (move) { - case MOVE_DOWN: - if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - if (WARN_ON(count == children)) - return -EINVAL; - pred->index &= ~FILTER_PRED_FOLD; - root->ops[count++] = pred->index; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); +static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + struct filter_pred *preds = data; - return 0; + if (move != MOVE_DOWN) + return WALK_PRED_DEFAULT; + if (!(pred->index & FILTER_PRED_FOLD)) + return WALK_PRED_DEFAULT; + + *err = fold_pred(preds, pred); + if (*err) + return WALK_PRED_ABORT; + + /* everything below is folded, continue with parent */ + return WALK_PRED_PARENT; } /* @@ -1542,51 +1515,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root) static int fold_pred_tree(struct event_filter *filter, struct filter_pred *root) { - struct filter_pred *preds; - struct filter_pred *pred; - enum move_type move = MOVE_DOWN; - int done = 0; - int err; - - preds = filter->preds; - if (!preds) - return -EINVAL; - pred = root; - - do { - switch (move) { - case MOVE_DOWN: - if (pred->index & FILTER_PRED_FOLD) { - err = fold_pred(preds, pred); - if (err) - return err; - /* Folded nodes are like leafs */ - } else if (pred->left != FILTER_PRED_INVALID) { - pred = &preds[pred->left]; - continue; - } - - /* A leaf at the root is just a leaf in the tree */ - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - case MOVE_UP_FROM_LEFT: - pred = &preds[pred->right]; - move = MOVE_DOWN; - continue; - case
MOVE_UP_FROM_RIGHT: - if (pred == root) - break; - pred = get_pred_parent(pred, preds, - pred->parent, &move); - continue; - } - done = 1; - } while (!done); - - return 0; + return walk_pred_tree(filter->preds, root, fold_pred_tree_cb, + filter->preds); } static int replace_preds(struct ftrace_event_call *call, @@ -1643,27 +1573,17 @@ static int replace_preds(struct ftrace_event_call *call, goto fail; } - if (elt->op == OP_AND || elt->op == OP_OR) { - pred = create_logical_pred(elt->op); - goto add_pred; - } - - if (!operand1 || !operand2) { - parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + pred = create_pred(ps, call, elt->op, operand1, operand2); + if (!pred) { err = -EINVAL; goto fail; } - pred = create_pred(elt->op, operand1, operand2); -add_pred: - if (!pred) { - err = -ENOMEM; - goto fail; + if (!dry_run) { + err = filter_add_pred(ps, filter, pred, &stack); + if (err) + goto fail; } - err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); - filter_free_pred(pred); - if (err) - goto fail; operand1 = operand2 = NULL; } @@ -1958,17 +1878,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, int err; struct event_filter *filter; struct filter_parse_state *ps; - struct ftrace_event_call *call = NULL; + struct ftrace_event_call *call; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (call->event.type == event_id) - break; - } + call = event->tp_event; err = -EINVAL; - if (&call->list == &ftrace_events) + if (!call) goto out_unlock; err = -EEXIST; @@ -2012,3 +1929,215 @@ out_unlock: #endif /* CONFIG_PERF_EVENTS */ +#ifdef CONFIG_FTRACE_STARTUP_TEST + +#include <linux/types.h> +#include <linux/tracepoint.h> + +#define CREATE_TRACE_POINTS +#include "trace_events_filter_test.h" + +static int test_get_filter(char *filter_str, struct ftrace_event_call *call, + struct event_filter **pfilter) +{ + struct event_filter *filter; + struct filter_parse_state *ps; + int err = -ENOMEM; + + filter = __alloc_filter(); + 
if (!filter) + goto out; + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_filter; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + *pfilter = filter; + + free_ps: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + free_filter: + if (err) + __free_filter(filter); + + out: + return err; +} + +#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \ +{ \ + .filter = FILTER, \ + .rec = { .a = va, .b = vb, .c = vc, .d = vd, \ + .e = ve, .f = vf, .g = vg, .h = vh }, \ + .match = m, \ + .not_visited = nvisit, \ +} +#define YES 1 +#define NO 0 + +static struct test_filter_data_t { + char *filter; + struct ftrace_raw_ftrace_test_filter rec; + int match; + char *not_visited; +} test_filter_data[] = { +#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \ + "e == 1 && f == 1 && g == 1 && h == 1" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""), + DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"), + DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""), +#undef FILTER +#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \ + "e == 1 || f == 1 || g == 1 || h == 1" + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"), +#undef FILTER +#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \ + "(e == 1 || f == 1) && (g == 1 || h == 1)" + DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"), + DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"), +#undef FILTER +#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"), + DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), +#undef FILTER +#define FILTER "(a == 1 && b == 1) && (c == 1 && 
d == 1) && " \ + "(e == 1 && f == 1) || (g == 1 && h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""), +#undef FILTER +#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \ + "(e == 1 || f == 1)) && (g == 1 || h == 1)" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"), + DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""), + DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"), +#undef FILTER +#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \ + "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"), + DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""), +#undef FILTER +#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \ + "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))" + DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"), + DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""), + DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"), +}; + +#undef DATA_REC +#undef FILTER +#undef YES +#undef NO + +#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t)) + +static int test_pred_visited; + +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + struct ftrace_event_field *field = pred->field; + + test_pred_visited = 1; + printk(KERN_INFO "\npred visited %s\n", field->name); + return 1; +} + +static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred, + int *err, void *data) +{ + char *fields = data; + + if ((move == MOVE_DOWN) && + (pred->left == FILTER_PRED_INVALID)) { + struct ftrace_event_field *field = pred->field; + + if (!field) { + WARN(1, "all leafs should have field defined"); + return WALK_PRED_DEFAULT; + } + if (!strchr(fields, *field->name)) + return WALK_PRED_DEFAULT; + + WARN_ON(!pred->fn); + pred->fn = test_pred_visited_fn; + } + return WALK_PRED_DEFAULT; +} + +static __init int ftrace_test_event_filter(void) 
+{ + int i; + + printk(KERN_INFO "Testing ftrace filter: "); + + for (i = 0; i < DATA_CNT; i++) { + struct event_filter *filter = NULL; + struct test_filter_data_t *d = &test_filter_data[i]; + int err; + + err = test_get_filter(d->filter, &event_ftrace_test_filter, + &filter); + if (err) { + printk(KERN_INFO + "Failed to get filter for '%s', err %d\n", + d->filter, err); + break; + } + + /* + * The preemption disabling is not really needed for self + * tests, but the rcu dereference will complain without it. + */ + preempt_disable(); + if (*d->not_visited) + walk_pred_tree(filter->preds, filter->root, + test_walk_pred_cb, + d->not_visited); + + test_pred_visited = 0; + err = filter_match_preds(filter, &d->rec); + preempt_enable(); + + __free_filter(filter); + + if (test_pred_visited) { + printk(KERN_INFO + "Failed, unwanted pred visited for filter %s\n", + d->filter); + break; + } + + if (err != d->match) { + printk(KERN_INFO + "Failed to match filter '%s', expected %d\n", + d->filter, d->match); + break; + } + } + + if (i == DATA_CNT) + printk(KERN_CONT "OK\n"); + + return 0; +} + +late_initcall(ftrace_test_event_filter); + +#endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h new file mode 100644 index 000000000000..bfd4dba0d603 --- /dev/null +++ b/kernel/trace/trace_events_filter_test.h @@ -0,0 +1,50 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM test + +#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TEST_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(ftrace_test_filter, + + TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h), + + TP_ARGS(a, b, c, d, e, f, g, h), + + TP_STRUCT__entry( + __field(int, a) + __field(int, b) + __field(int, c) + __field(int, d) + __field(int, e) + __field(int, f) + __field(int, g) + __field(int, h) + ), + + TP_fast_assign( + __entry->a = a; + __entry->b = b; + __entry->c = c; + __entry->d = d; + __entry->e 
= e; + __entry->f = f; + __entry->g = g; + __entry->h = h; + ), + + TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d", + __entry->a, __entry->b, __entry->c, __entry->d, + __entry->e, __entry->f, __entry->g, __entry->h) +); + +#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_events_filter_test + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 667aa8cc0cfc..20dad0d7a163 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly; static DEFINE_PER_CPU(int, tracing_cpu); -static DEFINE_SPINLOCK(max_trace_lock); +static DEFINE_RAW_SPINLOCK(max_trace_lock); enum { TRACER_IRQS_OFF = (1 << 1), @@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr, if (!report_latency(delta)) goto out; - spin_lock_irqsave(&max_trace_lock, flags); + raw_spin_lock_irqsave(&max_trace_lock, flags); /* check if we are still the max latency */ if (!report_latency(delta)) @@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr, max_sequence++; out_unlock: - spin_unlock_irqrestore(&max_trace_lock, flags); + raw_spin_unlock_irqrestore(&max_trace_lock, flags); out: data->critical_sequence = max_sequence; @@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller); #ifdef CONFIG_PREEMPT_TRACER void trace_preempt_on(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - if (preempt_trace()) + if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); } #endif /* CONFIG_PREEMPT_TRACER */ diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5fb3697bf0e5..00d527c945a4 100644 --- 
a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp) } /* Unregister a trace_probe and probe_event: call with locking probe_lock */ -static void unregister_trace_probe(struct trace_probe *tp) +static int unregister_trace_probe(struct trace_probe *tp) { + /* Enabled event can not be unregistered */ + if (trace_probe_is_enabled(tp)) + return -EBUSY; + __unregister_trace_probe(tp); list_del(&tp->list); unregister_probe_event(tp); + + return 0; } /* Register a trace_probe and probe_event */ @@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp) /* Delete old (same name) event if exist */ old_tp = find_trace_probe(tp->call.name, tp->call.class->system); if (old_tp) { - unregister_trace_probe(old_tp); + ret = unregister_trace_probe(old_tp); + if (ret < 0) + goto end; free_trace_probe(old_tp); } @@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb, mutex_lock(&probe_lock); list_for_each_entry(tp, &probe_list, list) { if (trace_probe_within_module(tp, mod)) { + /* Don't need to check busy - this should have gone. */ __unregister_trace_probe(tp); ret = __register_trace_probe(tp); if (ret) @@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv) return -ENOENT; } /* delete an event */ - unregister_trace_probe(tp); - free_trace_probe(tp); + ret = unregister_trace_probe(tp); + if (ret == 0) + free_trace_probe(tp); mutex_unlock(&probe_lock); - return 0; + return ret; } if (argc < 2) { @@ -1317,18 +1327,29 @@ error: return ret; } -static void release_all_trace_probes(void) +static int release_all_trace_probes(void) { struct trace_probe *tp; + int ret = 0; mutex_lock(&probe_lock); + /* Ensure no probe is in use. 
*/ + list_for_each_entry(tp, &probe_list, list) + if (trace_probe_is_enabled(tp)) { + ret = -EBUSY; + goto end; + } /* TODO: Use batch unregistration */ while (!list_empty(&probe_list)) { tp = list_entry(probe_list.next, struct trace_probe, list); unregister_trace_probe(tp); free_trace_probe(tp); } + +end: mutex_unlock(&probe_lock); + + return ret; } /* Probes listing interfaces */ @@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = { static int probes_open(struct inode *inode, struct file *file) { - if ((file->f_mode & FMODE_WRITE) && - (file->f_flags & O_TRUNC)) - release_all_trace_probes(); + int ret; + + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ret = release_all_trace_probes(); + if (ret < 0) + return ret; + } return seq_open(file, &probes_seq_op); } @@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void) ret = target(1, 2, 3, 4, 5, 6); + /* Disable trace points before removing it */ + tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting test probe.\n"); + warn++; + } else + disable_trace_probe(tp, TP_FLAG_TRACE); + + tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting 2nd test probe.\n"); + warn++; + } else + disable_trace_probe(tp, TP_FLAG_TRACE); + ret = command_trace_probe("-:testprobe"); if (WARN_ON_ONCE(ret)) { pr_warning("error on deleting a probe.\n"); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1f06468a10d7..6fd4ffd042f9 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) continue; } + fmt = NULL; tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); - if (tb_fmt) + if (tb_fmt) { fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); - if (tb_fmt && fmt) { - list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); - 
strcpy(fmt, *iter); - tb_fmt->fmt = fmt; - *iter = tb_fmt->fmt; - } else { - kfree(tb_fmt); - *iter = NULL; + if (fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; + } else + kfree(tb_fmt); } + *iter = fmt; + } mutex_unlock(&btrace_mutex); } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index b219f1449c54..db110b8ae030 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[]; static const int tracepoint_debug; /* - * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the - * builtin and module tracepoints and the hash table. + * Tracepoints mutex protects the builtin and module tracepoints and the hash + * table, as well as the local module list. */ static DEFINE_MUTEX(tracepoints_mutex); +#ifdef CONFIG_MODULES +/* Local list of struct module */ +static LIST_HEAD(tracepoint_module_list); +#endif /* CONFIG_MODULES */ + /* * Tracepoint hash table, containing the active tracepoints. * Protected by tracepoints_mutex. @@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem) * @end: end of the range * * Updates the probe callback corresponding to a range of tracepoints. + * Called with tracepoints_mutex held. 
*/ -void tracepoint_update_probe_range(struct tracepoint * const *begin, - struct tracepoint * const *end) +static void tracepoint_update_probe_range(struct tracepoint * const *begin, + struct tracepoint * const *end) { struct tracepoint * const *iter; struct tracepoint_entry *mark_entry; @@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, if (!begin) return; - mutex_lock(&tracepoints_mutex); for (iter = begin; iter < end; iter++) { mark_entry = get_tracepoint((*iter)->name); if (mark_entry) { @@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin, disable_tracepoint(*iter); } } - mutex_unlock(&tracepoints_mutex); } +#ifdef CONFIG_MODULES +void module_update_tracepoints(void) +{ + struct tp_module *tp_mod; + + list_for_each_entry(tp_mod, &tracepoint_module_list, list) + tracepoint_update_probe_range(tp_mod->tracepoints_ptrs, + tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints); +} +#else /* CONFIG_MODULES */ +void module_update_tracepoints(void) +{ +} +#endif /* CONFIG_MODULES */ + + /* * Update probes, removing the faulty probes. + * Called with tracepoints_mutex held. 
*/ static void tracepoint_update_probes(void) { @@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_add_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data) mutex_lock(&tracepoints_mutex); old = tracepoint_remove_probe(name, probe, data); - mutex_unlock(&tracepoints_mutex); - if (IS_ERR(old)) + if (IS_ERR(old)) { + mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); - + } tracepoint_update_probes(); /* may update entry */ + mutex_unlock(&tracepoints_mutex); release_probes(old); return 0; } @@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void) if (!list_empty(&old_probes)) list_replace_init(&old_probes, &release_probes); need_update = 0; - mutex_unlock(&tracepoints_mutex); - tracepoint_update_probes(); + mutex_unlock(&tracepoints_mutex); list_for_each_entry_safe(pos, next, &release_probes, u.list) { list_del(&pos->u.list); call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); @@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all); * Will return the first tracepoint in the range if the input tracepoint is * NULL. 
*/ -int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, +static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, struct tracepoint * const *begin, struct tracepoint * const *end) { if (!*tracepoint && begin != end) { @@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, return 1; return 0; } -EXPORT_SYMBOL_GPL(tracepoint_get_iter_range); +#ifdef CONFIG_MODULES static void tracepoint_get_iter(struct tracepoint_iter *iter) { int found = 0; + struct tp_module *iter_mod; /* Core kernel tracepoints */ if (!iter->module) { @@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter) if (found) goto end; } - /* tracepoints in modules. */ - found = module_get_iter_tracepoints(iter); + /* Tracepoints in modules */ + mutex_lock(&tracepoints_mutex); + list_for_each_entry(iter_mod, &tracepoint_module_list, list) { + /* + * Sorted module list + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints_ptrs, + iter_mod->tracepoints_ptrs + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; + break; + } + } + mutex_unlock(&tracepoints_mutex); end: if (!found) tracepoint_iter_reset(iter); } +#else /* CONFIG_MODULES */ +static void tracepoint_get_iter(struct tracepoint_iter *iter) +{ + int found = 0; + + /* Core kernel tracepoints */ + found = tracepoint_get_iter_range(&iter->tracepoint, + __start___tracepoints_ptrs, + __stop___tracepoints_ptrs); + if (!found) + tracepoint_iter_reset(iter); +} +#endif /* CONFIG_MODULES */ void tracepoint_iter_start(struct tracepoint_iter *iter) { @@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop); void tracepoint_iter_reset(struct tracepoint_iter *iter) { +#ifdef CONFIG_MODULES iter->module = NULL; +#endif /* CONFIG_MODULES */ iter->tracepoint = NULL; } 
EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +static int tracepoint_module_coming(struct module *mod) +{ + struct tp_module *tp_mod, *iter; + int ret = 0; + + /* + * We skip modules that taint the kernel, especially those with different + * module header (for forced load), to make sure we don't cause a crash. + */ + if (mod->taints) + return 0; + mutex_lock(&tracepoints_mutex); + tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); + if (!tp_mod) { + ret = -ENOMEM; + goto end; + } + tp_mod->num_tracepoints = mod->num_tracepoints; + tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs; + + /* + * tracepoint_module_list is kept sorted by struct module pointer + * address for iteration on tracepoints from a seq_file that can release + * the mutex between calls. + */ + list_for_each_entry_reverse(iter, &tracepoint_module_list, list) { + BUG_ON(iter == tp_mod); /* Should never be in the list twice */ + if (iter < tp_mod) { + /* We belong to the location right after iter. */ + list_add(&tp_mod->list, &iter->list); + goto module_added; + } + } + /* We belong to the beginning of the list */ + list_add(&tp_mod->list, &tracepoint_module_list); +module_added: + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); +end: + mutex_unlock(&tracepoints_mutex); + return ret; +} + +static int tracepoint_module_going(struct module *mod) +{ + struct tp_module *pos; + + mutex_lock(&tracepoints_mutex); + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); + list_for_each_entry(pos, &tracepoint_module_list, list) { + if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) { + list_del(&pos->list); + kfree(pos); + break; + } + } + /* + * In the case of modules that were tainted at "coming", we'll simply + * walk through the list without finding it. We cannot use the "tainted" + * flag on "going", in case a module taints the kernel only after being + * loaded.
+ */ + mutex_unlock(&tracepoints_mutex); + return 0; +} int tracepoint_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; + int ret = 0; switch (val) { case MODULE_STATE_COMING: + ret = tracepoint_module_coming(mod); + break; + case MODULE_STATE_LIVE: + break; case MODULE_STATE_GOING: - tracepoint_update_probe_range(mod->tracepoints_ptrs, - mod->tracepoints_ptrs + mod->num_tracepoints); + ret = tracepoint_module_going(mod); break; } - return 0; + return ret; } struct notifier_block tracepoint_module_nb = { @@ -598,7 +724,6 @@ static int init_tracepoints(void) return register_module_notifier(&tracepoint_module_nb); } __initcall(init_tracepoints); - #endif /* CONFIG_MODULES */ #ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 36491cd5b7d4..d680381b0e9c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) */ static int watchdog(void *unused) { - static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); sched_setscheduler(current, SCHED_FIFO, &param); @@ -350,7 +350,8 @@ static int watchdog(void *unused) set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); - + param.sched_priority = 0; + sched_setscheduler(current, SCHED_NORMAL, &param); return 0; } @@ -438,7 +439,7 @@ static int watchdog_enable(int cpu) /* create the watchdog thread */ if (!p) { - p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); if (!err) { |