summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/async.c4
-rw-r--r--kernel/cgroup.c18
-rw-r--r--kernel/cred.c18
-rw-r--r--kernel/events/core.c101
-rw-r--r--kernel/freezer.c2
-rw-r--r--kernel/futex.c10
-rw-r--r--kernel/irq/chip.c64
-rw-r--r--kernel/irq/internals.h19
-rw-r--r--kernel/irq/irqdesc.c32
-rw-r--r--kernel/irq/manage.c218
-rw-r--r--kernel/irq/pm.c48
-rw-r--r--kernel/irq/settings.h7
-rw-r--r--kernel/kmod.c4
-rw-r--r--kernel/kprobes.c34
-rw-r--r--kernel/latencytop.c14
-rw-r--r--kernel/lockdep.c240
-rw-r--r--kernel/module.c47
-rw-r--r--kernel/params.c21
-rw-r--r--kernel/pid.c4
-rw-r--r--kernel/posix-cpu-timers.c17
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/Makefile4
-rw-r--r--kernel/power/console.c4
-rw-r--r--kernel/power/hibernate.c53
-rw-r--r--kernel/power/main.c102
-rw-r--r--kernel/power/power.h4
-rw-r--r--kernel/power/process.c30
-rw-r--r--kernel/power/qos.c (renamed from kernel/pm_qos_params.c)273
-rw-r--r--kernel/power/snapshot.c18
-rw-r--r--kernel/power/suspend.c17
-rw-r--r--kernel/power/swap.c818
-rw-r--r--kernel/printk.c46
-rw-r--r--kernel/rcu.h85
-rw-r--r--kernel/rcupdate.c26
-rw-r--r--kernel/rcutiny.c117
-rw-r--r--kernel/rcutiny_plugin.h134
-rw-r--r--kernel/rcutorture.c77
-rw-r--r--kernel/rcutree.c290
-rw-r--r--kernel/rcutree.h17
-rw-r--r--kernel/rcutree_plugin.h150
-rw-r--r--kernel/rcutree_trace.c13
-rw-r--r--kernel/rtmutex-debug.c77
-rw-r--r--kernel/rtmutex.c8
-rw-r--r--kernel/sched.c15
-rw-r--r--kernel/sched_stats.h12
-rw-r--r--kernel/semaphore.c28
-rw-r--r--kernel/signal.c24
-rw-r--r--kernel/sys.c3
-rw-r--r--kernel/sysctl_binary.c2
-rw-r--r--kernel/time/tick-sched.c6
-rw-r--r--kernel/time/timer_stats.c6
-rw-r--r--kernel/trace/Makefile5
-rw-r--r--kernel/trace/ftrace.c8
-rw-r--r--kernel/trace/ring_buffer.c122
-rw-r--r--kernel/trace/rpm-traces.c20
-rw-r--r--kernel/trace/trace.c191
-rw-r--r--kernel/trace/trace.h16
-rw-r--r--kernel/trace/trace_clock.c12
-rw-r--r--kernel/trace/trace_events_filter.c795
-rw-r--r--kernel/trace/trace_events_filter_test.h50
-rw-r--r--kernel/trace/trace_irqsoff.c10
-rw-r--r--kernel/trace/trace_kprobe.c58
-rw-r--r--kernel/trace/trace_printk.c19
-rw-r--r--kernel/tracepoint.c169
-rw-r--r--kernel/watchdog.c7
66 files changed, 3381 insertions, 1488 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index eca595e2fd52..2da48d3515eb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
- notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+ notifier.o ksysfs.o sched_clock.o cred.o \
async.o range.o
obj-y += groups.o
diff --git a/kernel/async.c b/kernel/async.c
index d5fe7af0de2e..4c2843c0043e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -120,7 +120,7 @@ static void async_run_entry_fn(struct work_struct *work)
struct async_entry *entry =
container_of(work, struct async_entry, work);
unsigned long flags;
- ktime_t calltime, delta, rettime;
+ ktime_t uninitialized_var(calltime), delta, rettime;
/* 1) move self to the running queue */
spin_lock_irqsave(&async_lock, flags);
@@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
void async_synchronize_cookie_domain(async_cookie_t cookie,
struct list_head *running)
{
- ktime_t starttime, delta, endtime;
+ ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95d..453100a4159d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list)
/* the list of cgroups eligible for automatic release. Protected by
* release_list_lock */
static LIST_HEAD(release_list);
-static DEFINE_SPINLOCK(release_list_lock);
+static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
@@ -4014,11 +4014,11 @@ again:
finish_wait(&cgroup_rmdir_waitq, &wait);
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
list_del_init(&cgrp->release_list);
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
@@ -4671,13 +4671,13 @@ static void check_for_release(struct cgroup *cgrp)
* already queued for a userspace notification, queue
* it now */
int need_schedule_work = 0;
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
if (!cgroup_is_removed(cgrp) &&
list_empty(&cgrp->release_list)) {
list_add(&cgrp->release_list, &release_list);
need_schedule_work = 1;
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
if (need_schedule_work)
schedule_work(&release_agent_work);
}
@@ -4729,7 +4729,7 @@ static void cgroup_release_agent(struct work_struct *work)
{
BUG_ON(work != &release_agent_work);
mutex_lock(&cgroup_mutex);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
while (!list_empty(&release_list)) {
char *argv[3], *envp[3];
int i;
@@ -4738,7 +4738,7 @@ static void cgroup_release_agent(struct work_struct *work)
struct cgroup,
release_list);
list_del_init(&cgrp->release_list);
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!pathbuf)
goto continue_free;
@@ -4768,9 +4768,9 @@ static void cgroup_release_agent(struct work_struct *work)
continue_free:
kfree(pathbuf);
kfree(agentbuf);
- spin_lock(&release_list_lock);
+ raw_spin_lock(&release_list_lock);
}
- spin_unlock(&release_list_lock);
+ raw_spin_unlock(&release_list_lock);
mutex_unlock(&cgroup_mutex);
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 8ef31f53c44c..bb55d052d858 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -644,6 +644,9 @@ void __init cred_init(void)
*/
struct cred *prepare_kernel_cred(struct task_struct *daemon)
{
+#ifdef CONFIG_KEYS
+ struct thread_group_cred *tgcred;
+#endif
const struct cred *old;
struct cred *new;
@@ -651,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (!new)
return NULL;
+#ifdef CONFIG_KEYS
+ tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
+ if (!tgcred) {
+ kmem_cache_free(cred_jar, new);
+ return NULL;
+ }
+#endif
+
kdebug("prepare_kernel_cred() alloc %p", new);
if (daemon)
@@ -667,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
get_group_info(new->group_info);
#ifdef CONFIG_KEYS
- atomic_inc(&init_tgcred.usage);
- new->tgcred = &init_tgcred;
+ atomic_set(&tgcred->usage, 1);
+ spin_lock_init(&tgcred->lock);
+ tgcred->process_keyring = NULL;
+ tgcred->session_keyring = NULL;
+ new->tgcred = tgcred;
new->request_key_auth = NULL;
new->thread_keyring = NULL;
new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0f857782d06f..d1a1bee35228 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -29,6 +29,7 @@
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
+#include <linux/suspend.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
@@ -5758,6 +5759,7 @@ struct pmu *perf_init_event(struct perf_event *event)
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
if (pmu) {
+ event->pmu = pmu;
ret = pmu->event_init(event);
if (ret)
pmu = ERR_PTR(ret);
@@ -5765,6 +5767,7 @@ struct pmu *perf_init_event(struct perf_event *event)
}
list_for_each_entry_rcu(pmu, &pmus, entry) {
+ event->pmu = pmu;
ret = pmu->event_init(event);
if (!ret)
goto unlock;
@@ -5891,8 +5894,6 @@ done:
return ERR_PTR(err);
}
- event->pmu = pmu;
-
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
jump_label_inc(&perf_sched_events);
@@ -6852,7 +6853,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
- if (swhash->hlist_refcount > 0) {
+ if (swhash->hlist_refcount > 0 && !swhash->swevent_hlist) {
struct swevent_hlist *hlist;
hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -6941,7 +6942,14 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
{
unsigned int cpu = (long)hcpu;
- switch (action & ~CPU_TASKS_FROZEN) {
+ /*
+ * Ignore suspend/resume action, the perf_pm_notifier will
+ * take care of that.
+ */
+ if (action & CPU_TASKS_FROZEN)
+ return NOTIFY_OK;
+
+ switch (action) {
case CPU_UP_PREPARE:
case CPU_DOWN_FAILED:
@@ -6960,6 +6968,90 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
return NOTIFY_OK;
}
+static void perf_pm_resume_cpu(void *unused)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int idx;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ ctx = cpuctx->task_ctx;
+
+ perf_ctx_lock(cpuctx, ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ if (ctx)
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, ctx);
+ }
+ srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static void perf_pm_suspend_cpu(void *unused)
+{
+ struct perf_cpu_context *cpuctx;
+ struct perf_event_context *ctx;
+ struct pmu *pmu;
+ int idx;
+
+ idx = srcu_read_lock(&pmus_srcu);
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+ ctx = cpuctx->task_ctx;
+
+ perf_ctx_lock(cpuctx, ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ perf_event_sched_in(cpuctx, ctx, current);
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ perf_ctx_unlock(cpuctx, ctx);
+ }
+ srcu_read_unlock(&pmus_srcu, idx);
+}
+
+static int perf_resume(void)
+{
+ get_online_cpus();
+ smp_call_function(perf_pm_resume_cpu, NULL, 1);
+ put_online_cpus();
+
+ return NOTIFY_OK;
+}
+
+static int perf_suspend(void)
+{
+ get_online_cpus();
+ smp_call_function(perf_pm_suspend_cpu, NULL, 1);
+ put_online_cpus();
+
+ return NOTIFY_OK;
+}
+
+static int perf_pm(struct notifier_block *self, unsigned long action, void *ptr)
+{
+ switch (action) {
+ case PM_POST_HIBERNATION:
+ case PM_POST_SUSPEND:
+ return perf_resume();
+ case PM_HIBERNATION_PREPARE:
+ case PM_SUSPEND_PREPARE:
+ return perf_suspend();
+ default:
+ return NOTIFY_DONE;
+ }
+}
+
+static struct notifier_block perf_pm_notifier = {
+ .notifier_call = perf_pm,
+};
+
void __init perf_event_init(void)
{
int ret;
@@ -6974,6 +7066,7 @@ void __init perf_event_init(void)
perf_tp_register();
perf_cpu_notifier(perf_cpu_notify);
register_reboot_notifier(&perf_reboot_notifier);
+ register_pm_notifier(&perf_pm_notifier);
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7b01de98bb6a..66a594e8ad2f 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -67,7 +67,7 @@ static void fake_signal_wake_up(struct task_struct *p)
unsigned long flags;
spin_lock_irqsave(&p->sighand->siglock, flags);
- signal_wake_up(p, 0);
+ signal_wake_up(p, 1);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 11cbe052b2e8..1511dff0cfd6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -854,7 +854,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
{
struct task_struct *new_owner;
struct futex_pi_state *pi_state = this->pi_state;
- u32 curval, newval;
+ u32 uninitialized_var(curval), newval;
if (!pi_state)
return -EINVAL;
@@ -916,7 +916,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
{
- u32 oldval;
+ u32 uninitialized_var(oldval);
/*
* There is no waiter, so we unlock the futex. The owner died
@@ -1576,7 +1576,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
struct task_struct *oldowner = pi_state->owner;
- u32 uval, curval, newval;
+ u32 uval, uninitialized_var(curval), newval;
int ret;
/* Owner died? */
@@ -1793,7 +1793,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
*
* Returns:
* 0 - uaddr contains val and hb has been locked
- * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2481,7 +2481,7 @@ err_unlock:
*/
int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
- u32 uval, nval, mval;
+ u32 uval, uninitialized_var(nval), mval;
retry:
if (get_user(uval, uaddr))
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index dc5114b4c16c..f7c543a801d9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -26,7 +26,7 @@
int irq_set_chip(unsigned int irq, struct irq_chip *chip)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return -EINVAL;
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip);
int irq_set_irq_type(unsigned int irq, unsigned int type)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
int ret = 0;
if (!desc)
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
int irq_set_handler_data(unsigned int irq, void *data)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return -EINVAL;
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return -EINVAL;
@@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
int irq_set_chip_data(unsigned int irq, void *data)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return -EINVAL;
@@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc)
}
}
+void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
+{
+ if (desc->irq_data.chip->irq_enable)
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ cpumask_set_cpu(cpu, desc->percpu_enabled);
+}
+
+void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
+{
+ if (desc->irq_data.chip->irq_disable)
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ cpumask_clear_cpu(cpu, desc->percpu_enabled);
+}
+
static inline void mask_ack_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_mask_ack)
@@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
chip->irq_eoi(&desc->irq_data);
}
+/**
+ * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Per CPU interrupts on SMP machines without locking requirements. Same as
+ * handle_percpu_irq() above but with the following extras:
+ *
+ * action->percpu_dev_id is a pointer to percpu variables which
+ * contain the real device id for the cpu on which this handler is
+ * called
+ */
+void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
+ irqreturn_t res;
+
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ if (chip->irq_ack)
+ chip->irq_ack(&desc->irq_data);
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+}
+
void
__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
const char *name)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
if (!desc)
return;
@@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return;
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6546431447d7..a73dd6c7372d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc);
extern void irq_shutdown(struct irq_desc *desc);
extern void irq_enable(struct irq_desc *desc);
extern void irq_disable(struct irq_desc *desc);
+extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
+extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
extern void mask_irq(struct irq_desc *desc);
extern void unmask_irq(struct irq_desc *desc);
@@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
}
+#define _IRQ_DESC_CHECK (1 << 0)
+#define _IRQ_DESC_PERCPU (1 << 1)
+
+#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
+#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
+
struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check);
void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
static inline struct irq_desc *
-irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
+irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
{
- return __irq_get_desc_lock(irq, flags, true);
+ return __irq_get_desc_lock(irq, flags, true, check);
}
static inline void
@@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
}
static inline struct irq_desc *
-irq_get_desc_lock(unsigned int irq, unsigned long *flags)
+irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
{
- return __irq_get_desc_lock(irq, flags, false);
+ return __irq_get_desc_lock(irq, flags, false, check);
}
static inline void
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 039b889ea053..1550e8447a16 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -424,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset)
}
struct irq_desc *
-__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
+ unsigned int check)
{
struct irq_desc *desc = irq_to_desc(irq);
if (desc) {
+ if (check & _IRQ_DESC_CHECK) {
+ if ((check & _IRQ_DESC_PERCPU) &&
+ !irq_settings_is_per_cpu_devid(desc))
+ return NULL;
+
+ if (!(check & _IRQ_DESC_PERCPU) &&
+ irq_settings_is_per_cpu_devid(desc))
+ return NULL;
+ }
+
if (bus)
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -443,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
chip_bus_sync_unlock(desc);
}
+int irq_set_percpu_devid(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return -EINVAL;
+
+ if (desc->percpu_enabled)
+ return -EINVAL;
+
+ desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
+
+ if (!desc->percpu_enabled)
+ return -ENOMEM;
+
+ irq_set_percpu_devid_flags(irq);
+ return 0;
+}
+
/**
* dynamic_irq_cleanup - cleanup a dynamically allocated irq
* @irq: irq number to initialize
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9b956fa20308..67ce837ae52c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return -EINVAL;
@@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
static int __disable_irq_nosync(unsigned int irq)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return -EINVAL;
@@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
void enable_irq(unsigned int irq)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
if (!desc)
return;
@@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
struct irq_desc *desc = irq_to_desc(irq);
int ret = -ENXIO;
+ if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
if (desc->irq_data.chip->irq_set_wake)
ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
@@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
int ret = 0;
if (!desc)
@@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
int can_request_irq(unsigned int irq, unsigned long irqflags)
{
unsigned long flags;
- struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
int canrequest = 0;
if (!desc)
@@ -1118,6 +1121,8 @@ int setup_irq(unsigned int irq, struct irqaction *act)
int retval;
struct irq_desc *desc = irq_to_desc(irq);
+ if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ return -EINVAL;
chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
chip_bus_sync_unlock(desc);
@@ -1126,7 +1131,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
}
EXPORT_SYMBOL_GPL(setup_irq);
- /*
+/*
* Internal function to unregister an irqaction - used to free
* regular and special interrupts that are part of the architecture.
*/
@@ -1224,7 +1229,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
*/
void remove_irq(unsigned int irq, struct irqaction *act)
{
- __free_irq(irq, act->dev_id);
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
+ __free_irq(irq, act->dev_id);
}
EXPORT_SYMBOL_GPL(remove_irq);
@@ -1246,7 +1254,7 @@ void free_irq(unsigned int irq, void *dev_id)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (!desc)
+ if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return;
#ifdef CONFIG_SMP
@@ -1324,7 +1332,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (!desc)
return -EINVAL;
- if (!irq_settings_can_request(desc))
+ if (!irq_settings_can_request(desc) ||
+ WARN_ON(irq_settings_is_per_cpu_devid(desc)))
return -EINVAL;
if (!handler) {
@@ -1409,3 +1418,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
return !ret ? IRQC_IS_HARDIRQ : ret;
}
EXPORT_SYMBOL_GPL(request_any_context_irq);
+
+void enable_percpu_irq(unsigned int irq, unsigned int type)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+ if (!desc)
+ return;
+
+ type &= IRQ_TYPE_SENSE_MASK;
+ if (type != IRQ_TYPE_NONE) {
+ int ret;
+
+ ret = __irq_set_trigger(desc, irq, type);
+
+ if (ret) {
+ WARN(1, "failed to set type for IRQ%d\n", irq);
+ goto out;
+ }
+ }
+
+ irq_percpu_enable(desc, cpu);
+out:
+ irq_put_desc_unlock(desc, flags);
+}
+
+void disable_percpu_irq(unsigned int irq)
+{
+ unsigned int cpu = smp_processor_id();
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
+
+ if (!desc)
+ return;
+
+ irq_percpu_disable(desc, cpu);
+ irq_put_desc_unlock(desc, flags);
+}
+
+/*
+ * Internal function to unregister a percpu irqaction.
+ */
+static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ struct irqaction *action;
+ unsigned long flags;
+
+ WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
+
+ if (!desc)
+ return NULL;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ action = desc->action;
+ if (!action || action->percpu_dev_id != dev_id) {
+ WARN(1, "Trying to free already-free IRQ %d\n", irq);
+ goto bad;
+ }
+
+ if (!cpumask_empty(desc->percpu_enabled)) {
+ WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
+ irq, cpumask_first(desc->percpu_enabled));
+ goto bad;
+ }
+
+ /* Found it - now remove it from the list of entries: */
+ desc->action = NULL;
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ unregister_handler_proc(irq, action);
+
+ module_put(desc->owner);
+ return action;
+
+bad:
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return NULL;
+}
+
+/**
+ * remove_percpu_irq - free a per-cpu interrupt
+ * @irq: Interrupt line to free
+ * @act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically setup by the early boot process.
+ */
+void remove_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc && irq_settings_is_per_cpu_devid(desc))
+ __free_percpu_irq(irq, act->percpu_dev_id);
+}
+
+/**
+ * free_percpu_irq - free an interrupt allocated with request_percpu_irq
+ * @irq: Interrupt line to free
+ * @dev_id: Device identity to free
+ *
+ * Remove a percpu interrupt handler. The handler is removed, but
+ * the interrupt line is not disabled. This must be done on each
+ * CPU before calling this function. The function does not return
+ * until any executing interrupts for this IRQ have completed.
+ *
+ * This function must not be called from interrupt context.
+ */
+void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return;
+
+ chip_bus_lock(desc);
+ kfree(__free_percpu_irq(irq, dev_id));
+ chip_bus_sync_unlock(desc);
+}
+
+/**
+ * setup_percpu_irq - setup a per-cpu interrupt
+ * @irq: Interrupt line to setup
+ * @act: irqaction for the interrupt
+ *
+ * Used to statically setup per-cpu interrupts in the early boot process.
+ */
+int setup_percpu_irq(unsigned int irq, struct irqaction *act)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int retval;
+
+ if (!desc || !irq_settings_is_per_cpu_devid(desc))
+ return -EINVAL;
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, act);
+ chip_bus_sync_unlock(desc);
+
+ return retval;
+}
+
+/**
+ * request_percpu_irq - allocate a percpu interrupt line
+ * @irq: Interrupt line to allocate
+ * @handler: Function to be called when the IRQ occurs.
+ * @devname: An ascii name for the claiming device
+ * @dev_id: A percpu cookie passed back to the handler function
+ *
+ * This call allocates interrupt resources, but doesn't
+ * automatically enable the interrupt. It has to be done on each
+ * CPU using enable_percpu_irq().
+ *
+ * Dev_id must be globally unique. It is a per-cpu variable, and
+ * the handler gets called with the interrupted CPU's instance of
+ * that variable.
+ */
+int request_percpu_irq(unsigned int irq, irq_handler_t handler,
+ const char *devname, void __percpu *dev_id)
+{
+ struct irqaction *action;
+ struct irq_desc *desc;
+ int retval;
+
+ if (!dev_id)
+ return -EINVAL;
+
+ desc = irq_to_desc(irq);
+ if (!desc || !irq_settings_can_request(desc) ||
+ !irq_settings_is_per_cpu_devid(desc))
+ return -EINVAL;
+
+ action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
+ if (!action)
+ return -ENOMEM;
+
+ action->handler = handler;
+ action->flags = IRQF_PERCPU;
+ action->name = devname;
+ action->percpu_dev_id = dev_id;
+
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, action);
+ chip_bus_sync_unlock(desc);
+
+ if (retval)
+ kfree(action);
+
+ return retval;
+}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c9877..15e53b1766a6 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/syscore_ops.h>
#include "internals.h"
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
-/**
- * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
- *
- * Enable all interrupt lines previously disabled by suspend_device_irqs() that
- * have the IRQS_SUSPENDED flag set.
- */
-void resume_device_irqs(void)
+static void resume_irqs(bool want_early)
{
struct irq_desc *desc;
int irq;
for_each_irq_desc(irq, desc) {
unsigned long flags;
+ bool is_early = desc->action &&
+ desc->action->flags & IRQF_EARLY_RESUME;
+
+ if (is_early != want_early)
+ continue;
raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, true);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
+
+/**
+ * irq_pm_syscore_ops - enable interrupt lines early
+ *
+ * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
+ */
+static void irq_pm_syscore_resume(void)
+{
+ resume_irqs(true);
+}
+
+static struct syscore_ops irq_pm_syscore_ops = {
+ .resume = irq_pm_syscore_resume,
+};
+
+static int __init irq_pm_init_ops(void)
+{
+ register_syscore_ops(&irq_pm_syscore_ops);
+ return 0;
+}
+
+device_initcall(irq_pm_init_ops);
+
+/**
+ * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
+ *
+ * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
+ * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
+ * set as well as those with %IRQF_FORCE_RESUME.
+ */
+void resume_device_irqs(void)
+{
+ resume_irqs(false);
+}
EXPORT_SYMBOL_GPL(resume_device_irqs);
/**
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index f1667833d444..1162f1030f18 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,6 +13,7 @@ enum {
_IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
_IRQ_NO_BALANCING = IRQ_NO_BALANCING,
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
+ _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -24,6 +25,7 @@ enum {
#define IRQ_NOTHREAD GOT_YOU_MORON
#define IRQ_NOAUTOEN GOT_YOU_MORON
#define IRQ_NESTED_THREAD GOT_YOU_MORON
+#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
return desc->status_use_accessors & _IRQ_PER_CPU;
}
+static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
+}
+
static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
{
desc->status_use_accessors |= _IRQ_PER_CPU;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddc7644c1305..a4bea97c75b6 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...)
atomic_inc(&kmod_concurrent);
if (atomic_read(&kmod_concurrent) > max_modprobes) {
/* We may be blaming an innocent here, but unlikely */
- if (kmod_loop_msg++ < 5)
+ if (kmod_loop_msg < 5) {
printk(KERN_ERR
"request_module: runaway loop modprobe %s\n",
module_name);
+ kmod_loop_msg++;
+ }
atomic_dec(&kmod_concurrent);
return -ENOMEM;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b30fd54eb985..2f193d0ba7f2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
static DEFINE_MUTEX(kprobe_mutex);
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
static struct {
- spinlock_t lock ____cacheline_aligned_in_smp;
+ raw_spinlock_t lock ____cacheline_aligned_in_smp;
} kretprobe_table_locks[KPROBE_TABLE_SIZE];
-static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
+static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
{
return &(kretprobe_table_locks[hash].lock);
}
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
hlist_del(&ri->hlist);
INIT_HLIST_NODE(&ri->hlist);
if (likely(rp)) {
- spin_lock(&rp->lock);
+ raw_spin_lock(&rp->lock);
hlist_add_head(&ri->hlist, &rp->free_instances);
- spin_unlock(&rp->lock);
+ raw_spin_unlock(&rp->lock);
} else
/* Unregistering */
hlist_add_head(&ri->hlist, head);
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
__acquires(hlist_lock)
{
unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- spinlock_t *hlist_lock;
+ raw_spinlock_t *hlist_lock;
*head = &kretprobe_inst_table[hash];
hlist_lock = kretprobe_table_lock_ptr(hash);
- spin_lock_irqsave(hlist_lock, *flags);
+ raw_spin_lock_irqsave(hlist_lock, *flags);
}
static void __kprobes kretprobe_table_lock(unsigned long hash,
unsigned long *flags)
__acquires(hlist_lock)
{
- spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- spin_lock_irqsave(hlist_lock, *flags);
+ raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ raw_spin_lock_irqsave(hlist_lock, *flags);
}
void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
__releases(hlist_lock)
{
unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
- spinlock_t *hlist_lock;
+ raw_spinlock_t *hlist_lock;
hlist_lock = kretprobe_table_lock_ptr(hash);
- spin_unlock_irqrestore(hlist_lock, *flags);
+ raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
static void __kprobes kretprobe_table_unlock(unsigned long hash,
unsigned long *flags)
__releases(hlist_lock)
{
- spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
- spin_unlock_irqrestore(hlist_lock, *flags);
+ raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
+ raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
/*
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
/*TODO: consider to only swap the RA after the last pre_handler fired */
hash = hash_ptr(current, KPROBE_HASH_BITS);
- spin_lock_irqsave(&rp->lock, flags);
+ raw_spin_lock_irqsave(&rp->lock, flags);
if (!hlist_empty(&rp->free_instances)) {
ri = hlist_entry(rp->free_instances.first,
struct kretprobe_instance, hlist);
hlist_del(&ri->hlist);
- spin_unlock_irqrestore(&rp->lock, flags);
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
ri->rp = rp;
ri->task = current;
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
kretprobe_table_unlock(hash, &flags);
} else {
rp->nmissed++;
- spin_unlock_irqrestore(&rp->lock, flags);
+ raw_spin_unlock_irqrestore(&rp->lock, flags);
}
return 0;
}
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
rp->maxactive = num_possible_cpus();
#endif
}
- spin_lock_init(&rp->lock);
+ raw_spin_lock_init(&rp->lock);
INIT_HLIST_HEAD(&rp->free_instances);
for (i = 0; i < rp->maxactive; i++) {
inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void)
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
INIT_HLIST_HEAD(&kprobe_table[i]);
INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
- spin_lock_init(&(kretprobe_table_locks[i].lock));
+ raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
}
/*
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 376066e10413..4ac8ebfcab59 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -58,7 +58,7 @@
#include <linux/list.h>
#include <linux/stacktrace.h>
-static DEFINE_SPINLOCK(latency_lock);
+static DEFINE_RAW_SPINLOCK(latency_lock);
#define MAXLR 128
static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
if (!latencytop_enabled)
return;
- spin_lock_irqsave(&latency_lock, flags);
+ raw_spin_lock_irqsave(&latency_lock, flags);
memset(&p->latency_record, 0, sizeof(p->latency_record));
p->latency_record_count = 0;
- spin_unlock_irqrestore(&latency_lock, flags);
+ raw_spin_unlock_irqrestore(&latency_lock, flags);
}
static void clear_global_latency_tracing(void)
{
unsigned long flags;
- spin_lock_irqsave(&latency_lock, flags);
+ raw_spin_lock_irqsave(&latency_lock, flags);
memset(&latency_record, 0, sizeof(latency_record));
- spin_unlock_irqrestore(&latency_lock, flags);
+ raw_spin_unlock_irqrestore(&latency_lock, flags);
}
static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
lat.max = usecs;
store_stacktrace(tsk, &lat);
- spin_lock_irqsave(&latency_lock, flags);
+ raw_spin_lock_irqsave(&latency_lock, flags);
account_global_scheduler_latency(tsk, &lat);
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
out_unlock:
- spin_unlock_irqrestore(&latency_lock, flags);
+ raw_spin_unlock_irqrestore(&latency_lock, flags);
}
static int lstats_show(struct seq_file *m, void *v)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 91d67ce3a8d5..e69434b070da 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -96,8 +96,13 @@ static int graph_lock(void)
static inline int graph_unlock(void)
{
- if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
+ if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
+ /*
+ * The lockdep graph lock isn't locked while we expect it to
+ * be, we're confused now, bye!
+ */
return DEBUG_LOCKS_WARN_ON(1);
+ }
current->lockdep_recursion--;
arch_spin_unlock(&lockdep_lock);
@@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
static inline struct lock_class *hlock_class(struct held_lock *hlock)
{
if (!hlock->class_idx) {
+ /*
+ * Someone passed in garbage, we give up.
+ */
DEBUG_LOCKS_WARN_ON(1);
return NULL;
}
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
*/
list_for_each_entry(class, hash_head, hash_entry) {
if (class->key == key) {
+ /*
+ * Huh! same key, different name? Did someone trample
+ * on some memory? We're most confused.
+ */
WARN_ON_ONCE(class->name != lock->name);
return class;
}
@@ -800,6 +812,10 @@ out_unlock_set:
else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
lock->class_cache[subclass] = class;
+ /*
+ * Hash collision, did we smoke some? We found a class with a matching
+ * hash but the subclass -- which is hashed in -- didn't match.
+ */
if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
return NULL;
@@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries);
+ WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
lock->parent = parent;
lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
@@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
unsigned long nr;
nr = lock - list_entries;
- WARN_ON(nr >= nr_list_entries);
+ WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -1129,10 +1145,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
if (debug_locks_silent)
return 0;
- printk("\n=======================================================\n");
- printk( "[ INFO: possible circular locking dependency detected ]\n");
+ printk("\n");
+ printk("======================================================\n");
+ printk("[ INFO: possible circular locking dependency detected ]\n");
print_kernel_version();
- printk( "-------------------------------------------------------\n");
+ printk("-------------------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(check_src);
@@ -1196,6 +1213,9 @@ static noinline int print_bfs_bug(int ret)
if (!debug_locks_off_graph_unlock())
return 0;
+ /*
+ * Breadth-first-search failed, graph got corrupted?
+ */
WARN(1, "lockdep bfs error:%d\n", ret);
return 0;
@@ -1463,11 +1483,12 @@ print_bad_irq_dependency(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n======================================================\n");
- printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
+ printk("\n");
+ printk("======================================================\n");
+ printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
irqclass, irqclass);
print_kernel_version();
- printk( "------------------------------------------------------\n");
+ printk("------------------------------------------------------\n");
printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
curr->comm, task_pid_nr(curr),
curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1692,10 +1713,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n=============================================\n");
- printk( "[ INFO: possible recursive locking detected ]\n");
+ printk("\n");
+ printk("=============================================\n");
+ printk("[ INFO: possible recursive locking detected ]\n");
print_kernel_version();
- printk( "---------------------------------------------\n");
+ printk("---------------------------------------------\n");
printk("%s/%d is trying to acquire lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(next);
@@ -1944,6 +1966,11 @@ out_bug:
if (!debug_locks_off_graph_unlock())
return 0;
+ /*
+ * Clearly we all shouldn't be here, but since we made it we
+ * can reliable say we messed up our state. See the above two
+ * gotos for reasons why we could possibly end up here.
+ */
WARN_ON(1);
return 0;
@@ -1975,6 +2002,11 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct held_lock *hlock_curr, *hlock_next;
int i, j;
+ /*
+ * We might need to take the graph lock, ensure we've got IRQs
+ * disabled to make this an IRQ-safe lock.. for recursion reasons
+ * lockdep won't complain about its own locking errors.
+ */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
/*
@@ -2126,6 +2158,10 @@ static void check_chain_key(struct task_struct *curr)
hlock = curr->held_locks + i;
if (chain_key != hlock->prev_chain_key) {
debug_locks_off();
+ /*
+ * We got mighty confused, our chain keys don't match
+ * with what we expect, someone trample on our task state?
+ */
WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
curr->lockdep_depth, i,
(unsigned long long)chain_key,
@@ -2133,6 +2169,9 @@ static void check_chain_key(struct task_struct *curr)
return;
}
id = hlock->class_idx - 1;
+ /*
+ * Whoops ran out of static storage again?
+ */
if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
return;
@@ -2144,6 +2183,10 @@ static void check_chain_key(struct task_struct *curr)
}
if (chain_key != curr->curr_chain_key) {
debug_locks_off();
+ /*
+ * More smoking hash instead of calculating it, damn see these
+ * numbers float.. I bet that a pink elephant stepped on my memory.
+ */
WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
curr->lockdep_depth, i,
(unsigned long long)chain_key,
@@ -2177,10 +2220,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n=================================\n");
- printk( "[ INFO: inconsistent lock state ]\n");
+ printk("\n");
+ printk("=================================\n");
+ printk("[ INFO: inconsistent lock state ]\n");
print_kernel_version();
- printk( "---------------------------------\n");
+ printk("---------------------------------\n");
printk("inconsistent {%s} -> {%s} usage.\n",
usage_str[prev_bit], usage_str[new_bit]);
@@ -2241,10 +2285,11 @@ print_irq_inversion_bug(struct task_struct *curr,
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
- printk("\n=========================================================\n");
- printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
+ printk("\n");
+ printk("=========================================================\n");
+ printk("[ INFO: possible irq lock inversion dependency detected ]\n");
print_kernel_version();
- printk( "---------------------------------------------------------\n");
+ printk("---------------------------------------------------------\n");
printk("%s/%d just changed the state of lock:\n",
curr->comm, task_pid_nr(curr));
print_lock(this);
@@ -2525,12 +2570,24 @@ void trace_hardirqs_on_caller(unsigned long ip)
return;
}
+ /*
+ * We're enabling irqs and according to our state above irqs weren't
+ * already enabled, yet we find the hardware thinks they are in fact
+ * enabled.. someone messed up their IRQ state tracing.
+ */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
+ /*
+ * See the fine text that goes along with this variable definition.
+ */
if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
return;
+ /*
+ * Can't allow enabling interrupts while in an interrupt handler,
+ * that's general bad form and such. Recursion, limited stack etc..
+ */
if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
return;
@@ -2558,6 +2615,10 @@ void trace_hardirqs_off_caller(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
+ /*
+ * So we're supposed to get called after you mask local IRQs, but for
+ * some reason the hardware doesn't quite think you did a proper job.
+ */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
@@ -2590,6 +2651,10 @@ void trace_softirqs_on(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
+ /*
+ * We fancy IRQs being disabled here, see softirq.c, avoids
+ * funny state and nesting things.
+ */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
@@ -2626,6 +2691,9 @@ void trace_softirqs_off(unsigned long ip)
if (unlikely(!debug_locks || current->lockdep_recursion))
return;
+ /*
+ * We fancy IRQs being disabled here, see softirq.c
+ */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return;
@@ -2637,6 +2705,9 @@ void trace_softirqs_off(unsigned long ip)
curr->softirq_disable_ip = ip;
curr->softirq_disable_event = ++curr->irq_events;
debug_atomic_inc(softirqs_off_events);
+ /*
+ * Whoops, we wanted softirqs off, so why aren't they?
+ */
DEBUG_LOCKS_WARN_ON(!softirq_count());
} else
debug_atomic_inc(redundant_softirqs_off);
@@ -2661,6 +2732,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
if (!(gfp_mask & __GFP_FS))
return;
+ /*
+ * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
+ */
if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
return;
@@ -2773,13 +2847,13 @@ static int separate_irq_context(struct task_struct *curr,
return 0;
}
-#else
+#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
static inline
int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
- WARN_ON(1);
+ WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
return 1;
}
@@ -2799,7 +2873,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
{
}
-#endif
+#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
/*
* Mark a lock with a usage bit, and validate the state transition:
@@ -2880,6 +2954,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
lock->cpu = raw_smp_processor_id();
#endif
+ /*
+ * Can't be having no nameless bastards around this place!
+ */
if (DEBUG_LOCKS_WARN_ON(!name)) {
lock->name = "NULL";
return;
@@ -2887,6 +2964,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
lock->name = name;
+ /*
+ * No key, no joy, we need to hash something.
+ */
if (DEBUG_LOCKS_WARN_ON(!key))
return;
/*
@@ -2894,6 +2974,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
*/
if (!static_obj(key)) {
printk("BUG: key %p not in .data!\n", key);
+ /*
+ * What it says above ^^^^^, I suggest you read it.
+ */
DEBUG_LOCKS_WARN_ON(1);
return;
}
@@ -2932,6 +3015,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (unlikely(!debug_locks))
return 0;
+ /*
+ * Lockdep should run with IRQs disabled, otherwise we could
+ * get an interrupt which would want to take locks, which would
+ * end up in lockdep and have you got a head-ache already?
+ */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
@@ -2963,6 +3051,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* dependency checks are done)
*/
depth = curr->lockdep_depth;
+ /*
+ * Ran out of static storage for our per-task lock stack again have we?
+ */
if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
return 0;
@@ -2981,6 +3072,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
}
hlock = curr->held_locks + depth;
+ /*
+ * Plain impossible, we just registered it and checked it weren't no
+ * NULL like.. I bet this mushroom I ate was good!
+ */
if (DEBUG_LOCKS_WARN_ON(!class))
return 0;
hlock->class_idx = class_idx;
@@ -3015,11 +3110,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* the hash, not class->key.
*/
id = class - lock_classes;
+ /*
+ * Whoops, we did it again.. ran straight out of our static allocation.
+ */
if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
return 0;
chain_key = curr->curr_chain_key;
if (!depth) {
+ /*
+ * How can we have a chain hash when we ain't got no keys?!
+ */
if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
return 0;
chain_head = 1;
@@ -3065,9 +3166,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
if (debug_locks_silent)
return 0;
- printk("\n=====================================\n");
- printk( "[ BUG: bad unlock balance detected! ]\n");
- printk( "-------------------------------------\n");
+ printk("\n");
+ printk("=====================================\n");
+ printk("[ BUG: bad unlock balance detected! ]\n");
+ printk("-------------------------------------\n");
printk("%s/%d is trying to release lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
@@ -3091,6 +3193,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
{
if (unlikely(!debug_locks))
return 0;
+ /*
+ * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
+ */
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
@@ -3120,6 +3225,11 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
if (!class)
return 0;
+ /*
+ * References, but not a lock we're actually ref-counting?
+ * State got messed up, follow the sites that change ->references
+ * and try to make sense of it.
+ */
if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
return 0;
@@ -3142,6 +3252,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
int i;
depth = curr->lockdep_depth;
+ /*
+ * This function is about (re)setting the class of a held lock,
+ * yet we're not actually holding any locks. Naughty user!
+ */
if (DEBUG_LOCKS_WARN_ON(!depth))
return 0;
@@ -3177,6 +3291,10 @@ found_it:
return 0;
}
+ /*
+ * I took it apart and put it back together again, except now I have
+ * these 'spare' parts.. where shall I put them.
+ */
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
return 0;
return 1;
@@ -3201,6 +3319,10 @@ lock_release_non_nested(struct task_struct *curr,
* of held locks:
*/
depth = curr->lockdep_depth;
+ /*
+ * So we're all set to release this lock.. wait what lock? We don't
+ * own any locks, you've been drinking again?
+ */
if (DEBUG_LOCKS_WARN_ON(!depth))
return 0;
@@ -3253,6 +3375,10 @@ found_it:
return 0;
}
+ /*
+ * We had N bottles of beer on the wall, we drank one, but now
+ * there's not N-1 bottles of beer left on the wall...
+ */
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
return 0;
return 1;
@@ -3283,6 +3409,9 @@ static int lock_release_nested(struct task_struct *curr,
return lock_release_non_nested(curr, lock, ip);
curr->lockdep_depth--;
+ /*
+ * No more locks, but somehow we've got hash left over, who left it?
+ */
if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
return 0;
@@ -3365,10 +3494,13 @@ static void check_flags(unsigned long flags)
* check if not in hardirq contexts:
*/
if (!hardirq_count()) {
- if (softirq_count())
+ if (softirq_count()) {
+ /* like the above, but with softirqs */
DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
- else
+ } else {
+ /* lick the above, does it taste good? */
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+ }
}
if (!debug_locks)
@@ -3478,9 +3610,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
if (debug_locks_silent)
return 0;
- printk("\n=================================\n");
- printk( "[ BUG: bad contention detected! ]\n");
- printk( "---------------------------------\n");
+ printk("\n");
+ printk("=================================\n");
+ printk("[ BUG: bad contention detected! ]\n");
+ printk("---------------------------------\n");
printk("%s/%d is trying to contend lock (",
curr->comm, task_pid_nr(curr));
print_lockdep_cache(lock);
@@ -3506,6 +3639,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
int i, contention_point, contending_point;
depth = curr->lockdep_depth;
+ /*
+ * Whee, we contended on this lock, except it seems we're not
+ * actually trying to acquire anything much at all..
+ */
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
@@ -3555,6 +3692,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
int i, cpu;
depth = curr->lockdep_depth;
+ /*
+ * Yay, we acquired ownership of this lock we didn't try to
+ * acquire, how the heck did that happen?
+ */
if (DEBUG_LOCKS_WARN_ON(!depth))
return;
@@ -3759,8 +3900,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
match |= class == lock->class_cache[j];
if (unlikely(match)) {
- if (debug_locks_off_graph_unlock())
+ if (debug_locks_off_graph_unlock()) {
+ /*
+ * We all just reset everything, how did it match?
+ */
WARN_ON(1);
+ }
goto out_restore;
}
}
@@ -3839,9 +3984,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
if (debug_locks_silent)
return;
- printk("\n=========================\n");
- printk( "[ BUG: held lock freed! ]\n");
- printk( "-------------------------\n");
+ printk("\n");
+ printk("=========================\n");
+ printk("[ BUG: held lock freed! ]\n");
+ printk("-------------------------\n");
printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
print_lock(hlock);
@@ -3895,9 +4041,10 @@ static void print_held_locks_bug(struct task_struct *curr)
if (debug_locks_silent)
return;
- printk("\n=====================================\n");
- printk( "[ BUG: lock held at task exit time! ]\n");
- printk( "-------------------------------------\n");
+ printk("\n");
+ printk("=====================================\n");
+ printk("[ BUG: lock held at task exit time! ]\n");
+ printk("-------------------------------------\n");
printk("%s/%d is exiting with locks still held!\n",
curr->comm, task_pid_nr(curr));
lockdep_print_held_locks(curr);
@@ -3991,16 +4138,17 @@ void lockdep_sys_exit(void)
if (unlikely(curr->lockdep_depth)) {
if (!debug_locks_off())
return;
- printk("\n================================================\n");
- printk( "[ BUG: lock held when returning to user space! ]\n");
- printk( "------------------------------------------------\n");
+ printk("\n");
+ printk("================================================\n");
+ printk("[ BUG: lock held when returning to user space! ]\n");
+ printk("------------------------------------------------\n");
printk("%s/%d is leaving the kernel with locks still held!\n",
curr->comm, curr->pid);
lockdep_print_held_locks(curr);
}
}
-void lockdep_rcu_dereference(const char *file, const int line)
+void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
{
struct task_struct *curr = current;
@@ -4009,15 +4157,15 @@ void lockdep_rcu_dereference(const char *file, const int line)
return;
#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
/* Note: the following can be executed concurrently, so be careful. */
- printk("\n===================================================\n");
- printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
- printk( "---------------------------------------------------\n");
- printk("%s:%d invoked rcu_dereference_check() without protection!\n",
- file, line);
+ printk("\n");
+ printk("===============================\n");
+ printk("[ INFO: suspicious RCU usage. ]\n");
+ printk("-------------------------------\n");
+ printk("%s:%d %s!\n", file, line, s);
printk("\nother info that might help us debug this:\n\n");
printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
lockdep_print_held_locks(curr);
printk("\nstack backtrace:\n");
dump_stack();
}
-EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
+EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
diff --git a/kernel/module.c b/kernel/module.c
index 04379f92f843..93342d992f34 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3487,50 +3487,3 @@ void module_layout(struct module *mod,
}
EXPORT_SYMBOL(module_layout);
#endif
-
-#ifdef CONFIG_TRACEPOINTS
-void module_update_tracepoints(void)
-{
- struct module *mod;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(mod, &modules, list)
- if (!mod->taints)
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
- mutex_unlock(&module_mutex);
-}
-
-/*
- * Returns 0 if current not found.
- * Returns 1 if current found.
- */
-int module_get_iter_tracepoints(struct tracepoint_iter *iter)
-{
- struct module *iter_mod;
- int found = 0;
-
- mutex_lock(&module_mutex);
- list_for_each_entry(iter_mod, &modules, list) {
- if (!iter_mod->taints) {
- /*
- * Sorted module list
- */
- if (iter_mod < iter->module)
- continue;
- else if (iter_mod > iter->module)
- iter->tracepoint = NULL;
- found = tracepoint_get_iter_range(&iter->tracepoint,
- iter_mod->tracepoints_ptrs,
- iter_mod->tracepoints_ptrs
- + iter_mod->num_tracepoints);
- if (found) {
- iter->module = iter_mod;
- break;
- }
- }
- }
- mutex_unlock(&module_mutex);
- return found;
-}
-#endif
diff --git a/kernel/params.c b/kernel/params.c
index 22df3e0d142a..821788947e40 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param)
}
}
-static inline char dash2underscore(char c)
+static char dash2underscore(char c)
{
if (c == '-')
return '_';
return c;
}
-static inline int parameq(const char *input, const char *paramname)
+bool parameqn(const char *a, const char *b, size_t n)
{
- unsigned int i;
- for (i = 0; dash2underscore(input[i]) == paramname[i]; i++)
- if (input[i] == '\0')
- return 1;
- return 0;
+ size_t i;
+
+ for (i = 0; i < n; i++) {
+ if (dash2underscore(a[i]) != dash2underscore(b[i]))
+ return false;
+ }
+ return true;
+}
+
+bool parameq(const char *a, const char *b)
+{
+ return parameqn(a, b, strlen(a)+1);
}
static int parse_one(char *param,
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057f3b21..8cafe7e72ad2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task);
*/
struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
{
- rcu_lockdep_assert(rcu_read_lock_held());
+ rcu_lockdep_assert(rcu_read_lock_held(),
+ "find_task_by_pid_ns() needs rcu_read_lock()"
+ " protection");
return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c8008dd58ef2..e7cb76dc18f5 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
struct task_cputime sum;
unsigned long flags;
- spin_lock_irqsave(&cputimer->lock, flags);
if (!cputimer->running) {
- cputimer->running = 1;
/*
* The POSIX timer interface allows for absolute time expiry
* values through the TIMER_ABSTIME flag, therefore we have
@@ -284,10 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
* it.
*/
thread_group_cputime(tsk, &sum);
+ raw_spin_lock_irqsave(&cputimer->lock, flags);
+ cputimer->running = 1;
update_gt_cputime(&cputimer->cputime, &sum);
- }
+ } else
+ raw_spin_lock_irqsave(&cputimer->lock, flags);
*times = cputimer->cputime;
- spin_unlock_irqrestore(&cputimer->lock, flags);
+ raw_spin_unlock_irqrestore(&cputimer->lock, flags);
}
/*
@@ -998,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig)
struct thread_group_cputimer *cputimer = &sig->cputimer;
unsigned long flags;
- spin_lock_irqsave(&cputimer->lock, flags);
+ raw_spin_lock_irqsave(&cputimer->lock, flags);
cputimer->running = 0;
- spin_unlock_irqrestore(&cputimer->lock, flags);
+ raw_spin_unlock_irqrestore(&cputimer->lock, flags);
}
static u32 onecputick;
@@ -1290,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
if (sig->cputimer.running) {
struct task_cputime group_sample;
- spin_lock(&sig->cputimer.lock);
+ raw_spin_lock(&sig->cputimer.lock);
group_sample = sig->cputimer.cputime;
- spin_unlock(&sig->cputimer.lock);
+ raw_spin_unlock(&sig->cputimer.lock);
if (task_cputime_expired(&group_sample, &sig->cputime_expires))
return 1;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 3744c594b19b..cedd9982306a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,7 @@ config HIBERNATION
select HIBERNATE_CALLBACKS
select LZO_COMPRESS
select LZO_DECOMPRESS
+ select CRC32
---help---
Enable the suspend to disk (STD) functionality, which is usually
called "hibernation" in user interfaces. STD checkpoints the
@@ -65,6 +66,9 @@ config HIBERNATION
For more information take a look at <file:Documentation/power/swsusp.txt>.
+config ARCH_SAVE_PAGE_KEYS
+ bool
+
config PM_STD_PARTITION
string "Default resume partition"
depends on HIBERNATION
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a90643..07e0e28ffba7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,8 +1,8 @@
ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
-obj-$(CONFIG_PM) += main.o
-obj-$(CONFIG_PM_SLEEP) += console.o
+obj-$(CONFIG_PM) += main.o qos.o
+obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 218e5af90156..b1dc456474b5 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
/*
- * drivers/power/process.c - Functions for saving/restoring console.
+ * Functions for saving/restoring console.
*
* Originally from swsusp.
*/
@@ -10,7 +10,6 @@
#include <linux/module.h>
#include "power.h"
-#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
static int orig_fgconsole, orig_kmsg;
@@ -32,4 +31,3 @@ void pm_restore_console(void)
vt_kmsg_redirect(orig_kmsg);
}
}
-#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8f7b1db1ece1..1c53f7fad5f7 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -14,6 +14,7 @@
#include <linux/reboot.h>
#include <linux/string.h>
#include <linux/device.h>
+#include <linux/async.h>
#include <linux/kmod.h>
#include <linux/delay.h>
#include <linux/fs.h>
@@ -29,12 +30,14 @@
#include "power.h"
-static int nocompress = 0;
-static int noresume = 0;
+static int nocompress;
+static int noresume;
+static int resume_wait;
+static int resume_delay;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
-int in_suspend __nosavedata = 0;
+int in_suspend __nosavedata;
enum {
HIBERNATION_INVALID,
@@ -334,13 +337,17 @@ int hibernation_snapshot(int platform_mode)
if (error)
goto Close;
- error = dpm_prepare(PMSG_FREEZE);
- if (error)
- goto Complete_devices;
-
/* Preallocate image memory before shutting down devices. */
error = hibernate_preallocate_memory();
if (error)
+ goto Close;
+
+ error = freeze_kernel_threads();
+ if (error)
+ goto Close;
+
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error)
goto Complete_devices;
suspend_console();
@@ -463,7 +470,7 @@ static int resume_target_kernel(bool platform_mode)
* @platform_mode: If set, use platform driver to prepare for the transition.
*
* This routine must be called with pm_mutex held. If it is successful, control
- * reappears in the restored target kernel in hibernation_snaphot().
+ * reappears in the restored target kernel in hibernation_snapshot().
*/
int hibernation_restore(int platform_mode)
{
@@ -650,6 +657,9 @@ int hibernate(void)
flags |= SF_PLATFORM_MODE;
if (nocompress)
flags |= SF_NOCOMPRESS_MODE;
+ else
+ flags |= SF_CRC32_MODE;
+
pr_debug("PM: writing image.\n");
error = swsusp_write(flags);
swsusp_free();
@@ -724,6 +734,12 @@ static int software_resume(void)
pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
+ if (resume_delay) {
+ printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
+ resume_delay);
+ ssleep(resume_delay);
+ }
+
/* Check if the device is there */
swsusp_resume_device = name_to_dev_t(resume_file);
if (!swsusp_resume_device) {
@@ -732,6 +748,13 @@ static int software_resume(void)
* to wait for this to finish.
*/
wait_for_device_probe();
+
+ if (resume_wait) {
+ while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
+ msleep(10);
+ async_synchronize_full();
+ }
+
/*
* We can't depend on SCSI devices being available after loading
* one of their modules until scsi_complete_async_scans() is
@@ -1060,7 +1083,21 @@ static int __init noresume_setup(char *str)
return 1;
}
+static int __init resumewait_setup(char *str)
+{
+ resume_wait = 1;
+ return 1;
+}
+
+static int __init resumedelay_setup(char *str)
+{
+ resume_delay = simple_strtoul(str, NULL, 0);
+ return 1;
+}
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
__setup("hibernate=", hibernate_setup);
+__setup("resumewait", resumewait_setup);
+__setup("resumedelay=", resumedelay_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f871964..a52e88425a31 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -12,6 +12,8 @@
#include <linux/string.h>
#include <linux/resume-trace.h>
#include <linux/workqueue.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include "power.h"
@@ -131,6 +133,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
power_attr(pm_test);
#endif /* CONFIG_PM_DEBUG */
+#ifdef CONFIG_DEBUG_FS
+static char *suspend_step_name(enum suspend_stat_step step)
+{
+ switch (step) {
+ case SUSPEND_FREEZE:
+ return "freeze";
+ case SUSPEND_PREPARE:
+ return "prepare";
+ case SUSPEND_SUSPEND:
+ return "suspend";
+ case SUSPEND_SUSPEND_NOIRQ:
+ return "suspend_noirq";
+ case SUSPEND_RESUME_NOIRQ:
+ return "resume_noirq";
+ case SUSPEND_RESUME:
+ return "resume";
+ default:
+ return "";
+ }
+}
+
+static int suspend_stats_show(struct seq_file *s, void *unused)
+{
+ int i, index, last_dev, last_errno, last_step;
+
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
+ last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
+ last_errno %= REC_FAILED_NUM;
+ last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
+ last_step %= REC_FAILED_NUM;
+ seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
+ "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
+ "success", suspend_stats.success,
+ "fail", suspend_stats.fail,
+ "failed_freeze", suspend_stats.failed_freeze,
+ "failed_prepare", suspend_stats.failed_prepare,
+ "failed_suspend", suspend_stats.failed_suspend,
+ "failed_suspend_noirq",
+ suspend_stats.failed_suspend_noirq,
+ "failed_resume", suspend_stats.failed_resume,
+ "failed_resume_noirq",
+ suspend_stats.failed_resume_noirq);
+ seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
+ suspend_stats.failed_devs[last_dev]);
+ for (i = 1; i < REC_FAILED_NUM; i++) {
+ index = last_dev + REC_FAILED_NUM - i;
+ index %= REC_FAILED_NUM;
+ seq_printf(s, "\t\t\t%-s\n",
+ suspend_stats.failed_devs[index]);
+ }
+ seq_printf(s, " last_failed_errno:\t%-d\n",
+ suspend_stats.errno[last_errno]);
+ for (i = 1; i < REC_FAILED_NUM; i++) {
+ index = last_errno + REC_FAILED_NUM - i;
+ index %= REC_FAILED_NUM;
+ seq_printf(s, "\t\t\t%-d\n",
+ suspend_stats.errno[index]);
+ }
+ seq_printf(s, " last_failed_step:\t%-s\n",
+ suspend_step_name(
+ suspend_stats.failed_steps[last_step]));
+ for (i = 1; i < REC_FAILED_NUM; i++) {
+ index = last_step + REC_FAILED_NUM - i;
+ index %= REC_FAILED_NUM;
+ seq_printf(s, "\t\t\t%-s\n",
+ suspend_step_name(
+ suspend_stats.failed_steps[index]));
+ }
+
+ return 0;
+}
+
+static int suspend_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, suspend_stats_show, NULL);
+}
+
+static const struct file_operations suspend_stats_operations = {
+ .open = suspend_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init pm_debugfs_init(void)
+{
+ debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
+ NULL, NULL, &suspend_stats_operations);
+ return 0;
+}
+
+late_initcall(pm_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
+
#endif /* CONFIG_PM_SLEEP */
struct kobject *power_kobj;
@@ -194,6 +291,11 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
}
if (state < PM_SUSPEND_MAX && *s)
error = enter_state(state);
+ if (error) {
+ suspend_stats.fail++;
+ dpm_save_failed_errno(error);
+ } else
+ suspend_stats.success++;
#endif
Exit:
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a26280..23a2db1ec442 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void);
*/
#define SF_PLATFORM_MODE 1
#define SF_NOCOMPRESS_MODE 2
+#define SF_CRC32_MODE 4
/* kernel/power/hibernate.c */
extern int swsusp_check(void);
@@ -228,7 +229,8 @@ extern int pm_test_level;
#ifdef CONFIG_SUSPEND_FREEZER
static inline int suspend_freeze_processes(void)
{
- return freeze_processes();
+ int error = freeze_processes();
+ return error ? : freeze_kernel_threads();
}
static inline void suspend_thaw_processes(void)
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9d..addbbe5531bc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only)
}
/**
- * freeze_processes - tell processes to enter the refrigerator
+ * freeze_processes - Signal user space processes to enter the refrigerator.
*/
int freeze_processes(void)
{
@@ -143,20 +143,30 @@ int freeze_processes(void)
printk("Freezing user space processes ... ");
error = try_to_freeze_tasks(true);
- if (error)
- goto Exit;
- printk("done.\n");
+ if (!error) {
+ printk("done.");
+ oom_killer_disable();
+ }
+ printk("\n");
+ BUG_ON(in_atomic());
+
+ return error;
+}
+
+/**
+ * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
+ */
+int freeze_kernel_threads(void)
+{
+ int error;
printk("Freezing remaining freezable tasks ... ");
error = try_to_freeze_tasks(false);
- if (error)
- goto Exit;
- printk("done.");
+ if (!error)
+ printk("done.");
- oom_killer_disable();
- Exit:
- BUG_ON(in_atomic());
printk("\n");
+ BUG_ON(in_atomic());
return error;
}
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c
index 37f05d0f0793..1c1797dd1d1d 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/power/qos.c
@@ -29,7 +29,7 @@
/*#define DEBUG*/
-#include <linux/pm_qos_params.h>
+#include <linux/pm_qos.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
@@ -45,62 +45,57 @@
#include <linux/uaccess.h>
/*
- * locking rule: all changes to requests or notifiers lists
+ * locking rule: all changes to constraints or notifiers lists
* or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
* held, taken with _irqsave. One lock to rule them all
*/
-enum pm_qos_type {
- PM_QOS_MAX, /* return the largest value */
- PM_QOS_MIN /* return the smallest value */
-};
-
-/*
- * Note: The lockless read path depends on the CPU accessing
- * target_value atomically. Atomic access is only guaranteed on all CPU
- * types linux supports for 32 bit quantites
- */
struct pm_qos_object {
- struct plist_head requests;
- struct blocking_notifier_head *notifiers;
+ struct pm_qos_constraints *constraints;
struct miscdevice pm_qos_power_miscdev;
char *name;
- s32 target_value; /* Do not change to 64 bit */
- s32 default_value;
- enum pm_qos_type type;
};
static DEFINE_SPINLOCK(pm_qos_lock);
static struct pm_qos_object null_pm_qos;
+
static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
-static struct pm_qos_object cpu_dma_pm_qos = {
- .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests),
- .notifiers = &cpu_dma_lat_notifier,
- .name = "cpu_dma_latency",
+static struct pm_qos_constraints cpu_dma_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
.target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
.type = PM_QOS_MIN,
+ .notifiers = &cpu_dma_lat_notifier,
+};
+static struct pm_qos_object cpu_dma_pm_qos = {
+ .constraints = &cpu_dma_constraints,
};
static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
-static struct pm_qos_object network_lat_pm_qos = {
- .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests),
- .notifiers = &network_lat_notifier,
- .name = "network_latency",
+static struct pm_qos_constraints network_lat_constraints = {
+ .list = PLIST_HEAD_INIT(network_lat_constraints.list),
.target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
.default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
- .type = PM_QOS_MIN
+ .type = PM_QOS_MIN,
+ .notifiers = &network_lat_notifier,
+};
+static struct pm_qos_object network_lat_pm_qos = {
+ .constraints = &network_lat_constraints,
+ .name = "network_latency",
};
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
-static struct pm_qos_object network_throughput_pm_qos = {
- .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests),
- .notifiers = &network_throughput_notifier,
- .name = "network_throughput",
+static struct pm_qos_constraints network_tput_constraints = {
+ .list = PLIST_HEAD_INIT(network_tput_constraints.list),
.target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
.type = PM_QOS_MAX,
+ .notifiers = &network_throughput_notifier,
+};
+static struct pm_qos_object network_throughput_pm_qos = {
+ .constraints = &network_tput_constraints,
+ .name = "network_throughput",
};
@@ -127,17 +122,17 @@ static const struct file_operations pm_qos_power_fops = {
};
/* unlocked internal variant */
-static inline int pm_qos_get_value(struct pm_qos_object *o)
+static inline int pm_qos_get_value(struct pm_qos_constraints *c)
{
- if (plist_head_empty(&o->requests))
- return o->default_value;
+ if (plist_head_empty(&c->list))
+ return c->default_value;
- switch (o->type) {
+ switch (c->type) {
case PM_QOS_MIN:
- return plist_first(&o->requests)->prio;
+ return plist_first(&c->list)->prio;
case PM_QOS_MAX:
- return plist_last(&o->requests)->prio;
+ return plist_last(&c->list)->prio;
default:
/* runtime check for not using enum */
@@ -145,69 +140,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
}
}
-static inline s32 pm_qos_read_value(struct pm_qos_object *o)
+s32 pm_qos_read_value(struct pm_qos_constraints *c)
{
- return o->target_value;
+ return c->target_value;
}
-static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value)
+static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
{
- o->target_value = value;
+ c->target_value = value;
}
-static void update_target(struct pm_qos_object *o, struct plist_node *node,
- int del, int value)
+/**
+ * pm_qos_update_target - manages the constraints list and calls the notifiers
+ * if needed
+ * @c: constraints data struct
+ * @node: request to add to the list, to update or to remove
+ * @action: action to take on the constraints list
+ * @value: value of the request to add or update
+ *
+ * This function returns 1 if the aggregated constraint value has changed, 0
+ * otherwise.
+ */
+int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
+ enum pm_qos_req_action action, int value)
{
unsigned long flags;
- int prev_value, curr_value;
+ int prev_value, curr_value, new_value;
spin_lock_irqsave(&pm_qos_lock, flags);
- prev_value = pm_qos_get_value(o);
- /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
- if (value != PM_QOS_DEFAULT_VALUE) {
+ prev_value = pm_qos_get_value(c);
+ if (value == PM_QOS_DEFAULT_VALUE)
+ new_value = c->default_value;
+ else
+ new_value = value;
+
+ switch (action) {
+ case PM_QOS_REMOVE_REQ:
+ plist_del(node, &c->list);
+ break;
+ case PM_QOS_UPDATE_REQ:
/*
* to change the list, we atomically remove, reinit
* with new value and add, then see if the extremal
* changed
*/
- plist_del(node, &o->requests);
- plist_node_init(node, value);
- plist_add(node, &o->requests);
- } else if (del) {
- plist_del(node, &o->requests);
- } else {
- plist_add(node, &o->requests);
+ plist_del(node, &c->list);
+ case PM_QOS_ADD_REQ:
+ plist_node_init(node, new_value);
+ plist_add(node, &c->list);
+ break;
+ default:
+ /* no action */
+ ;
}
- curr_value = pm_qos_get_value(o);
- pm_qos_set_value(o, curr_value);
+
+ curr_value = pm_qos_get_value(c);
+ pm_qos_set_value(c, curr_value);
+
spin_unlock_irqrestore(&pm_qos_lock, flags);
- if (prev_value != curr_value)
- blocking_notifier_call_chain(o->notifiers,
+ if (prev_value != curr_value) {
+ blocking_notifier_call_chain(c->notifiers,
(unsigned long)curr_value,
NULL);
-}
-
-static int register_pm_qos_misc(struct pm_qos_object *qos)
-{
- qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
- qos->pm_qos_power_miscdev.name = qos->name;
- qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
-
- return misc_register(&qos->pm_qos_power_miscdev);
-}
-
-static int find_pm_qos_object_by_minor(int minor)
-{
- int pm_qos_class;
-
- for (pm_qos_class = 0;
- pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
- if (minor ==
- pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
- return pm_qos_class;
+ return 1;
+ } else {
+ return 0;
}
- return -1;
}
/**
@@ -218,11 +217,11 @@ static int find_pm_qos_object_by_minor(int minor)
*/
int pm_qos_request(int pm_qos_class)
{
- return pm_qos_read_value(pm_qos_array[pm_qos_class]);
+ return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
}
EXPORT_SYMBOL_GPL(pm_qos_request);
-int pm_qos_request_active(struct pm_qos_request_list *req)
+int pm_qos_request_active(struct pm_qos_request *req)
{
return req->pm_qos_class != 0;
}
@@ -230,40 +229,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
/**
* pm_qos_add_request - inserts new qos request into the list
- * @dep: pointer to a preallocated handle
+ * @req: pointer to a preallocated handle
* @pm_qos_class: identifies which list of qos request to use
* @value: defines the qos request
*
* This function inserts a new entry in the pm_qos_class list of requested qos
* performance characteristics. It recomputes the aggregate QoS expectations
- * for the pm_qos_class of parameters and initializes the pm_qos_request_list
+ * for the pm_qos_class of parameters and initializes the pm_qos_request
* handle. Caller needs to save this handle for later use in updates and
* removal.
*/
-void pm_qos_add_request(struct pm_qos_request_list *dep,
+void pm_qos_add_request(struct pm_qos_request *req,
int pm_qos_class, s32 value)
{
- struct pm_qos_object *o = pm_qos_array[pm_qos_class];
- int new_value;
+ if (!req) /*guard against callers passing in null */
+ return;
- if (pm_qos_request_active(dep)) {
+ if (pm_qos_request_active(req)) {
WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
return;
}
- if (value == PM_QOS_DEFAULT_VALUE)
- new_value = o->default_value;
- else
- new_value = value;
- plist_node_init(&dep->list, new_value);
- dep->pm_qos_class = pm_qos_class;
- update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
+ req->pm_qos_class = pm_qos_class;
+ pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
+ &req->node, PM_QOS_ADD_REQ, value);
}
EXPORT_SYMBOL_GPL(pm_qos_add_request);
/**
* pm_qos_update_request - modifies an existing qos request
- * @pm_qos_req : handle to list element holding a pm_qos request to use
+ * @req : handle to list element holding a pm_qos request to use
* @value: defines the qos request
*
* Updates an existing qos request for the pm_qos_class of parameters along
@@ -271,56 +266,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
*
* Attempts are made to make this code callable on hot code paths.
*/
-void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
+void pm_qos_update_request(struct pm_qos_request *req,
s32 new_value)
{
- s32 temp;
- struct pm_qos_object *o;
-
- if (!pm_qos_req) /*guard against callers passing in null */
+ if (!req) /*guard against callers passing in null */
return;
- if (!pm_qos_request_active(pm_qos_req)) {
+ if (!pm_qos_request_active(req)) {
WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
return;
}
- o = pm_qos_array[pm_qos_req->pm_qos_class];
-
- if (new_value == PM_QOS_DEFAULT_VALUE)
- temp = o->default_value;
- else
- temp = new_value;
-
- if (temp != pm_qos_req->list.prio)
- update_target(o, &pm_qos_req->list, 0, temp);
+ if (new_value != req->node.prio)
+ pm_qos_update_target(
+ pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_UPDATE_REQ, new_value);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
/**
* pm_qos_remove_request - modifies an existing qos request
- * @pm_qos_req: handle to request list element
+ * @req: handle to request list element
*
- * Will remove pm qos request from the list of requests and
+ * Will remove pm qos request from the list of constraints and
* recompute the current target value for the pm_qos_class. Call this
* on slow code paths.
*/
-void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
+void pm_qos_remove_request(struct pm_qos_request *req)
{
- struct pm_qos_object *o;
-
- if (pm_qos_req == NULL)
+ if (!req) /*guard against callers passing in null */
return;
/* silent return to keep pcm code cleaner */
- if (!pm_qos_request_active(pm_qos_req)) {
+ if (!pm_qos_request_active(req)) {
WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
return;
}
- o = pm_qos_array[pm_qos_req->pm_qos_class];
- update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
- memset(pm_qos_req, 0, sizeof(*pm_qos_req));
+ pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_REMOVE_REQ,
+ PM_QOS_DEFAULT_VALUE);
+ memset(req, 0, sizeof(*req));
}
EXPORT_SYMBOL_GPL(pm_qos_remove_request);
@@ -337,7 +323,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
int retval;
retval = blocking_notifier_chain_register(
- pm_qos_array[pm_qos_class]->notifiers, notifier);
+ pm_qos_array[pm_qos_class]->constraints->notifiers,
+ notifier);
return retval;
}
@@ -356,19 +343,43 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
int retval;
retval = blocking_notifier_chain_unregister(
- pm_qos_array[pm_qos_class]->notifiers, notifier);
+ pm_qos_array[pm_qos_class]->constraints->notifiers,
+ notifier);
return retval;
}
EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
+/* User space interface to PM QoS classes via misc devices */
+static int register_pm_qos_misc(struct pm_qos_object *qos)
+{
+ qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
+ qos->pm_qos_power_miscdev.name = qos->name;
+ qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
+
+ return misc_register(&qos->pm_qos_power_miscdev);
+}
+
+static int find_pm_qos_object_by_minor(int minor)
+{
+ int pm_qos_class;
+
+ for (pm_qos_class = 0;
+ pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
+ if (minor ==
+ pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
+ return pm_qos_class;
+ }
+ return -1;
+}
+
static int pm_qos_power_open(struct inode *inode, struct file *filp)
{
long pm_qos_class;
pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
if (pm_qos_class >= 0) {
- struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
+ struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
if (!req)
return -ENOMEM;
@@ -383,7 +394,7 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
static int pm_qos_power_release(struct inode *inode, struct file *filp)
{
- struct pm_qos_request_list *req;
+ struct pm_qos_request *req;
req = filp->private_data;
pm_qos_remove_request(req);
@@ -398,17 +409,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
{
s32 value;
unsigned long flags;
- struct pm_qos_object *o;
- struct pm_qos_request_list *pm_qos_req = filp->private_data;
+ struct pm_qos_request *req = filp->private_data;
- if (!pm_qos_req)
+ if (!req)
return -EINVAL;
- if (!pm_qos_request_active(pm_qos_req))
+ if (!pm_qos_request_active(req))
return -EINVAL;
- o = pm_qos_array[pm_qos_req->pm_qos_class];
spin_lock_irqsave(&pm_qos_lock, flags);
- value = pm_qos_get_value(o);
+ value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
spin_unlock_irqrestore(&pm_qos_lock, flags);
return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
@@ -418,7 +427,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos)
{
s32 value;
- struct pm_qos_request_list *pm_qos_req;
+ struct pm_qos_request *req;
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
@@ -449,8 +458,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
return -EINVAL;
}
- pm_qos_req = filp->private_data;
- pm_qos_update_request(pm_qos_req, value);
+ req = filp->private_data;
+ pm_qos_update_request(req, value);
return count;
}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 06efa54f93d6..cbe2c1441392 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void)
count += highmem;
count -= totalreserve_pages;
+ /* Add number of pages required for page keys (s390 only). */
+ size += page_key_additional_pages(saveable);
+
/* Compute the maximum number of saveable pages to leave in memory. */
max_size = (count - (size + PAGES_FOR_IO)) / 2
- 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
buf[j] = memory_bm_next_pfn(bm);
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
+ /* Save page key for data page (s390 only). */
+ page_key_read(buf + j);
}
}
@@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
if (unlikely(buf[j] == BM_END_OF_MAP))
break;
+ /* Extract and buffer page key for data page (s390 only). */
+ page_key_memorize(buf + j);
+
if (memory_bm_pfn_present(bm, buf[j]))
memory_bm_set_bit(bm, buf[j]);
else
@@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle)
if (error)
return error;
+ /* Allocate buffer for page keys. */
+ error = page_key_alloc(nr_copy_pages);
+ if (error)
+ return error;
+
} else if (handle->cur <= nr_meta_pages + 1) {
error = unpack_orig_pfns(buffer, &copy_bm);
if (error)
@@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
}
} else {
copy_last_highmem_page();
+ /* Restore page key for data page (s390 only). */
+ page_key_write(handle->buffer);
handle->buffer = get_buffer(&orig_bm, &ca);
if (IS_ERR(handle->buffer))
return PTR_ERR(handle->buffer);
@@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
void snapshot_write_finalize(struct snapshot_handle *handle)
{
copy_last_highmem_page();
+ /* Restore page key for data page (s390 only). */
+ page_key_write(handle->buffer);
+ page_key_free();
/* Free only if we have loaded the image entirely */
if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad2208f..fdd4263b995d 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -104,7 +104,10 @@ static int suspend_prepare(void)
goto Finish;
error = suspend_freeze_processes();
- if (!error)
+ if (error) {
+ suspend_stats.failed_freeze++;
+ dpm_save_failed_step(SUSPEND_FREEZE);
+ } else
return 0;
suspend_thaw_processes();
@@ -315,8 +318,16 @@ int enter_state(suspend_state_t state)
*/
int pm_suspend(suspend_state_t state)
{
- if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
- return enter_state(state);
+ int ret;
+ if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
+ ret = enter_state(state);
+ if (ret) {
+ suspend_stats.fail++;
+ dpm_save_failed_errno(ret);
+ } else
+ suspend_stats.success++;
+ return ret;
+ }
return -EINVAL;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c97c3a0eee3..11a594c4ba25 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -27,6 +27,10 @@
#include <linux/slab.h>
#include <linux/lzo.h>
#include <linux/vmalloc.h>
+#include <linux/cpumask.h>
+#include <linux/atomic.h>
+#include <linux/kthread.h>
+#include <linux/crc32.h>
#include "power.h"
@@ -43,8 +47,7 @@
* allocated and populated one at a time, so we only need one memory
* page to set up the entire structure.
*
- * During resume we also only need to use one swap_map_page structure
- * at a time.
+ * During resume we pick up all swap_map_page structures into a list.
*/
#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
@@ -54,6 +57,11 @@ struct swap_map_page {
sector_t next_swap;
};
+struct swap_map_page_list {
+ struct swap_map_page *map;
+ struct swap_map_page_list *next;
+};
+
/**
* The swap_map_handle structure is used for handling swap in
* a file-alike way
@@ -61,13 +69,18 @@ struct swap_map_page {
struct swap_map_handle {
struct swap_map_page *cur;
+ struct swap_map_page_list *maps;
sector_t cur_swap;
sector_t first_sector;
unsigned int k;
+ unsigned long nr_free_pages, written;
+ u32 crc32;
};
struct swsusp_header {
- char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
+ char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
+ sizeof(u32)];
+ u32 crc32;
sector_t image;
unsigned int flags; /* Flags to pass to the "boot" kernel */
char orig_sig[10];
@@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
swsusp_header->image = handle->first_sector;
swsusp_header->flags = flags;
+ if (flags & SF_CRC32_MODE)
+ swsusp_header->crc32 = handle->crc32;
error = hib_bio_write_page(swsusp_resume_block,
swsusp_header, NULL);
} else {
@@ -245,6 +260,7 @@ static int swsusp_swap_check(void)
static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
{
void *src;
+ int ret;
if (!offset)
return -ENOSPC;
@@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
if (src) {
copy_page(src, buf);
} else {
- WARN_ON_ONCE(1);
- bio_chain = NULL; /* Go synchronous */
- src = buf;
+ ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
+ if (ret)
+ return ret;
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ if (src) {
+ copy_page(src, buf);
+ } else {
+ WARN_ON_ONCE(1);
+ bio_chain = NULL; /* Go synchronous */
+ src = buf;
+ }
}
} else {
src = buf;
@@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle)
goto err_rel;
}
handle->k = 0;
+ handle->nr_free_pages = nr_free_pages() >> 1;
+ handle->written = 0;
handle->first_sector = handle->cur_swap;
return 0;
err_rel:
@@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
return error;
handle->cur->entries[handle->k++] = offset;
if (handle->k >= MAP_PAGE_ENTRIES) {
- error = hib_wait_on_bio_chain(bio_chain);
- if (error)
- goto out;
offset = alloc_swapdev_block(root_swap);
if (!offset)
return -ENOSPC;
handle->cur->next_swap = offset;
- error = write_page(handle->cur, handle->cur_swap, NULL);
+ error = write_page(handle->cur, handle->cur_swap, bio_chain);
if (error)
goto out;
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
}
+ if (bio_chain && ++handle->written > handle->nr_free_pages) {
+ error = hib_wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
+ handle->written = 0;
+ }
out:
return error;
}
@@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle,
LZO_HEADER, PAGE_SIZE)
#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
+/* Maximum number of threads for compression/decompression. */
+#define LZO_THREADS 3
+
+/* Maximum number of pages for read buffering. */
+#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
+
+
/**
* save_image - save the suspend image data
*/
@@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle,
return ret;
}
+/**
+ * Structure used for CRC32.
+ */
+struct crc_data {
+ struct task_struct *thr; /* thread */
+ atomic_t ready; /* ready to start flag */
+ atomic_t stop; /* ready to stop flag */
+ unsigned run_threads; /* nr current threads */
+ wait_queue_head_t go; /* start crc update */
+ wait_queue_head_t done; /* crc update done */
+ u32 *crc32; /* points to handle's crc32 */
+ size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
+ unsigned char *unc[LZO_THREADS]; /* uncompressed data */
+};
+
+/**
+ * CRC32 update function that runs in its own thread.
+ */
+static int crc32_threadfn(void *data)
+{
+ struct crc_data *d = data;
+ unsigned i;
+
+ while (1) {
+ wait_event(d->go, atomic_read(&d->ready) ||
+ kthread_should_stop());
+ if (kthread_should_stop()) {
+ d->thr = NULL;
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ break;
+ }
+ atomic_set(&d->ready, 0);
+
+ for (i = 0; i < d->run_threads; i++)
+ *d->crc32 = crc32_le(*d->crc32,
+ d->unc[i], *d->unc_len[i]);
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ }
+ return 0;
+}
+/**
+ * Structure used for LZO data compression.
+ */
+struct cmp_data {
+ struct task_struct *thr; /* thread */
+ atomic_t ready; /* ready to start flag */
+ atomic_t stop; /* ready to stop flag */
+ int ret; /* return code */
+ wait_queue_head_t go; /* start compression */
+ wait_queue_head_t done; /* compression done */
+ size_t unc_len; /* uncompressed length */
+ size_t cmp_len; /* compressed length */
+ unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
+ unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
+};
+
+/**
+ * Compression function that runs in its own thread.
+ */
+static int lzo_compress_threadfn(void *data)
+{
+ struct cmp_data *d = data;
+
+ while (1) {
+ wait_event(d->go, atomic_read(&d->ready) ||
+ kthread_should_stop());
+ if (kthread_should_stop()) {
+ d->thr = NULL;
+ d->ret = -1;
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ break;
+ }
+ atomic_set(&d->ready, 0);
+
+ d->ret = lzo1x_1_compress(d->unc, d->unc_len,
+ d->cmp + LZO_HEADER, &d->cmp_len,
+ d->wrk);
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ }
+ return 0;
+}
/**
* save_image_lzo - Save the suspend image data compressed with LZO.
@@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle,
struct bio *bio;
struct timeval start;
struct timeval stop;
- size_t off, unc_len, cmp_len;
- unsigned char *unc, *cmp, *wrk, *page;
+ size_t off;
+ unsigned thr, run_threads, nr_threads;
+ unsigned char *page = NULL;
+ struct cmp_data *data = NULL;
+ struct crc_data *crc = NULL;
+
+ /*
+ * We'll limit the number of threads for compression to limit memory
+ * footprint.
+ */
+ nr_threads = num_online_cpus() - 1;
+ nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_clean;
}
- wrk = vmalloc(LZO1X_1_MEM_COMPRESS);
- if (!wrk) {
- printk(KERN_ERR "PM: Failed to allocate LZO workspace\n");
- free_page((unsigned long)page);
- return -ENOMEM;
+ data = vmalloc(sizeof(*data) * nr_threads);
+ if (!data) {
+ printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ ret = -ENOMEM;
+ goto out_clean;
}
+ for (thr = 0; thr < nr_threads; thr++)
+ memset(&data[thr], 0, offsetof(struct cmp_data, go));
- unc = vmalloc(LZO_UNC_SIZE);
- if (!unc) {
- printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
- vfree(wrk);
- free_page((unsigned long)page);
- return -ENOMEM;
+ crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ if (!crc) {
+ printk(KERN_ERR "PM: Failed to allocate crc\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ memset(crc, 0, offsetof(struct crc_data, go));
+
+ /*
+ * Start the compression threads.
+ */
+ for (thr = 0; thr < nr_threads; thr++) {
+ init_waitqueue_head(&data[thr].go);
+ init_waitqueue_head(&data[thr].done);
+
+ data[thr].thr = kthread_run(lzo_compress_threadfn,
+ &data[thr],
+ "image_compress/%u", thr);
+ if (IS_ERR(data[thr].thr)) {
+ data[thr].thr = NULL;
+ printk(KERN_ERR
+ "PM: Cannot start compression threads\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
}
- cmp = vmalloc(LZO_CMP_SIZE);
- if (!cmp) {
- printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
- vfree(unc);
- vfree(wrk);
- free_page((unsigned long)page);
- return -ENOMEM;
+ /*
+ * Adjust number of free pages after all allocations have been done.
+ * We don't want to run out of pages when writing.
+ */
+ handle->nr_free_pages = nr_free_pages() >> 1;
+
+ /*
+ * Start the CRC32 thread.
+ */
+ init_waitqueue_head(&crc->go);
+ init_waitqueue_head(&crc->done);
+
+ handle->crc32 = 0;
+ crc->crc32 = &handle->crc32;
+ for (thr = 0; thr < nr_threads; thr++) {
+ crc->unc[thr] = data[thr].unc;
+ crc->unc_len[thr] = &data[thr].unc_len;
+ }
+
+ crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
+ if (IS_ERR(crc->thr)) {
+ crc->thr = NULL;
+ printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ ret = -ENOMEM;
+ goto out_clean;
}
printk(KERN_INFO
+ "PM: Using %u thread(s) for compression.\n"
"PM: Compressing and saving image data (%u pages) ... ",
- nr_to_write);
+ nr_threads, nr_to_write);
m = nr_to_write / 100;
if (!m)
m = 1;
@@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle,
bio = NULL;
do_gettimeofday(&start);
for (;;) {
- for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
- ret = snapshot_read_next(snapshot);
- if (ret < 0)
- goto out_finish;
-
- if (!ret)
+ for (thr = 0; thr < nr_threads; thr++) {
+ for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
+ ret = snapshot_read_next(snapshot);
+ if (ret < 0)
+ goto out_finish;
+
+ if (!ret)
+ break;
+
+ memcpy(data[thr].unc + off,
+ data_of(*snapshot), PAGE_SIZE);
+
+ if (!(nr_pages % m))
+ printk(KERN_CONT "\b\b\b\b%3d%%",
+ nr_pages / m);
+ nr_pages++;
+ }
+ if (!off)
break;
- memcpy(unc + off, data_of(*snapshot), PAGE_SIZE);
+ data[thr].unc_len = off;
- if (!(nr_pages % m))
- printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
- nr_pages++;
+ atomic_set(&data[thr].ready, 1);
+ wake_up(&data[thr].go);
}
- if (!off)
+ if (!thr)
break;
- unc_len = off;
- ret = lzo1x_1_compress(unc, unc_len,
- cmp + LZO_HEADER, &cmp_len, wrk);
- if (ret < 0) {
- printk(KERN_ERR "PM: LZO compression failed\n");
- break;
- }
+ crc->run_threads = thr;
+ atomic_set(&crc->ready, 1);
+ wake_up(&crc->go);
- if (unlikely(!cmp_len ||
- cmp_len > lzo1x_worst_compress(unc_len))) {
- printk(KERN_ERR "PM: Invalid LZO compressed length\n");
- ret = -1;
- break;
- }
+ for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
+ wait_event(data[thr].done,
+ atomic_read(&data[thr].stop));
+ atomic_set(&data[thr].stop, 0);
- *(size_t *)cmp = cmp_len;
+ ret = data[thr].ret;
- /*
- * Given we are writing one page at a time to disk, we copy
- * that much from the buffer, although the last bit will likely
- * be smaller than full page. This is OK - we saved the length
- * of the compressed data, so any garbage at the end will be
- * discarded when we read it.
- */
- for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
- memcpy(page, cmp + off, PAGE_SIZE);
+ if (ret < 0) {
+ printk(KERN_ERR "PM: LZO compression failed\n");
+ goto out_finish;
+ }
- ret = swap_write_page(handle, page, &bio);
- if (ret)
+ if (unlikely(!data[thr].cmp_len ||
+ data[thr].cmp_len >
+ lzo1x_worst_compress(data[thr].unc_len))) {
+ printk(KERN_ERR
+ "PM: Invalid LZO compressed length\n");
+ ret = -1;
goto out_finish;
+ }
+
+ *(size_t *)data[thr].cmp = data[thr].cmp_len;
+
+ /*
+ * Given we are writing one page at a time to disk, we
+ * copy that much from the buffer, although the last
+ * bit will likely be smaller than full page. This is
+ * OK - we saved the length of the compressed data, so
+ * any garbage at the end will be discarded when we
+ * read it.
+ */
+ for (off = 0;
+ off < LZO_HEADER + data[thr].cmp_len;
+ off += PAGE_SIZE) {
+ memcpy(page, data[thr].cmp + off, PAGE_SIZE);
+
+ ret = swap_write_page(handle, page, &bio);
+ if (ret)
+ goto out_finish;
+ }
}
+
+ wait_event(crc->done, atomic_read(&crc->stop));
+ atomic_set(&crc->stop, 0);
}
out_finish:
@@ -536,16 +737,25 @@ out_finish:
do_gettimeofday(&stop);
if (!ret)
ret = err2;
- if (!ret)
+ if (!ret) {
printk(KERN_CONT "\b\b\b\bdone\n");
- else
+ } else {
printk(KERN_CONT "\n");
+ }
swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
-
- vfree(cmp);
- vfree(unc);
- vfree(wrk);
- free_page((unsigned long)page);
+out_clean:
+ if (crc) {
+ if (crc->thr)
+ kthread_stop(crc->thr);
+ kfree(crc);
+ }
+ if (data) {
+ for (thr = 0; thr < nr_threads; thr++)
+ if (data[thr].thr)
+ kthread_stop(data[thr].thr);
+ vfree(data);
+ }
+ if (page) free_page((unsigned long)page);
return ret;
}
@@ -625,8 +835,15 @@ out_finish:
static void release_swap_reader(struct swap_map_handle *handle)
{
- if (handle->cur)
- free_page((unsigned long)handle->cur);
+ struct swap_map_page_list *tmp;
+
+ while (handle->maps) {
+ if (handle->maps->map)
+ free_page((unsigned long)handle->maps->map);
+ tmp = handle->maps;
+ handle->maps = handle->maps->next;
+ kfree(tmp);
+ }
handle->cur = NULL;
}
@@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle,
unsigned int *flags_p)
{
int error;
+ struct swap_map_page_list *tmp, *last;
+ sector_t offset;
*flags_p = swsusp_header->flags;
if (!swsusp_header->image) /* how can this happen? */
return -EINVAL;
- handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
- if (!handle->cur)
- return -ENOMEM;
+ handle->cur = NULL;
+ last = handle->maps = NULL;
+ offset = swsusp_header->image;
+ while (offset) {
+ tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
+ if (!tmp) {
+ release_swap_reader(handle);
+ return -ENOMEM;
+ }
+ memset(tmp, 0, sizeof(*tmp));
+ if (!handle->maps)
+ handle->maps = tmp;
+ if (last)
+ last->next = tmp;
+ last = tmp;
+
+ tmp->map = (struct swap_map_page *)
+ __get_free_page(__GFP_WAIT | __GFP_HIGH);
+ if (!tmp->map) {
+ release_swap_reader(handle);
+ return -ENOMEM;
+ }
- error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
- if (error) {
- release_swap_reader(handle);
- return error;
+ error = hib_bio_read_page(offset, tmp->map, NULL);
+ if (error) {
+ release_swap_reader(handle);
+ return error;
+ }
+ offset = tmp->map->next_swap;
}
handle->k = 0;
+ handle->cur = handle->maps->map;
return 0;
}
@@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
{
sector_t offset;
int error;
+ struct swap_map_page_list *tmp;
if (!handle->cur)
return -EINVAL;
@@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
if (error)
return error;
if (++handle->k >= MAP_PAGE_ENTRIES) {
- error = hib_wait_on_bio_chain(bio_chain);
handle->k = 0;
- offset = handle->cur->next_swap;
- if (!offset)
+ free_page((unsigned long)handle->maps->map);
+ tmp = handle->maps;
+ handle->maps = handle->maps->next;
+ kfree(tmp);
+ if (!handle->maps)
release_swap_reader(handle);
- else if (!error)
- error = hib_bio_read_page(offset, handle->cur, NULL);
+ else
+ handle->cur = handle->maps->map;
}
return error;
}
@@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle,
unsigned int nr_to_read)
{
unsigned int m;
- int error = 0;
+ int ret = 0;
struct timeval start;
struct timeval stop;
struct bio *bio;
@@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle,
bio = NULL;
do_gettimeofday(&start);
for ( ; ; ) {
- error = snapshot_write_next(snapshot);
- if (error <= 0)
+ ret = snapshot_write_next(snapshot);
+ if (ret <= 0)
break;
- error = swap_read_page(handle, data_of(*snapshot), &bio);
- if (error)
+ ret = swap_read_page(handle, data_of(*snapshot), &bio);
+ if (ret)
break;
if (snapshot->sync_read)
- error = hib_wait_on_bio_chain(&bio);
- if (error)
+ ret = hib_wait_on_bio_chain(&bio);
+ if (ret)
break;
if (!(nr_pages % m))
printk("\b\b\b\b%3d%%", nr_pages / m);
@@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle,
}
err2 = hib_wait_on_bio_chain(&bio);
do_gettimeofday(&stop);
- if (!error)
- error = err2;
- if (!error) {
+ if (!ret)
+ ret = err2;
+ if (!ret) {
printk("\b\b\b\bdone\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
- error = -ENODATA;
+ ret = -ENODATA;
} else
printk("\n");
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
- return error;
+ return ret;
+}
+
+/**
+ * Structure used for LZO data decompression.
+ */
+struct dec_data {
+ struct task_struct *thr; /* thread */
+ atomic_t ready; /* ready to start flag */
+ atomic_t stop; /* ready to stop flag */
+ int ret; /* return code */
+ wait_queue_head_t go; /* start decompression */
+ wait_queue_head_t done; /* decompression done */
+ size_t unc_len; /* uncompressed length */
+ size_t cmp_len; /* compressed length */
+ unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
+ unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
+};
+
+/**
+ * Deompression function that runs in its own thread.
+ */
+static int lzo_decompress_threadfn(void *data)
+{
+ struct dec_data *d = data;
+
+ while (1) {
+ wait_event(d->go, atomic_read(&d->ready) ||
+ kthread_should_stop());
+ if (kthread_should_stop()) {
+ d->thr = NULL;
+ d->ret = -1;
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ break;
+ }
+ atomic_set(&d->ready, 0);
+
+ d->unc_len = LZO_UNC_SIZE;
+ d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
+ d->unc, &d->unc_len);
+ atomic_set(&d->stop, 1);
+ wake_up(&d->done);
+ }
+ return 0;
}
/**
@@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle,
unsigned int nr_to_read)
{
unsigned int m;
- int error = 0;
+ int ret = 0;
+ int eof = 0;
struct bio *bio;
struct timeval start;
struct timeval stop;
unsigned nr_pages;
- size_t i, off, unc_len, cmp_len;
- unsigned char *unc, *cmp, *page[LZO_CMP_PAGES];
-
- for (i = 0; i < LZO_CMP_PAGES; i++) {
- page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
- if (!page[i]) {
- printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ size_t off;
+ unsigned i, thr, run_threads, nr_threads;
+ unsigned ring = 0, pg = 0, ring_size = 0,
+ have = 0, want, need, asked = 0;
+ unsigned long read_pages;
+ unsigned char **page = NULL;
+ struct dec_data *data = NULL;
+ struct crc_data *crc = NULL;
+
+ /*
+ * We'll limit the number of threads for decompression to limit memory
+ * footprint.
+ */
+ nr_threads = num_online_cpus() - 1;
+ nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
+
+ page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
+ if (!page) {
+ printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
- while (i)
- free_page((unsigned long)page[--i]);
+ data = vmalloc(sizeof(*data) * nr_threads);
+ if (!data) {
+ printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ for (thr = 0; thr < nr_threads; thr++)
+ memset(&data[thr], 0, offsetof(struct dec_data, go));
- return -ENOMEM;
+ crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ if (!crc) {
+ printk(KERN_ERR "PM: Failed to allocate crc\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
+ memset(crc, 0, offsetof(struct crc_data, go));
+
+ /*
+ * Start the decompression threads.
+ */
+ for (thr = 0; thr < nr_threads; thr++) {
+ init_waitqueue_head(&data[thr].go);
+ init_waitqueue_head(&data[thr].done);
+
+ data[thr].thr = kthread_run(lzo_decompress_threadfn,
+ &data[thr],
+ "image_decompress/%u", thr);
+ if (IS_ERR(data[thr].thr)) {
+ data[thr].thr = NULL;
+ printk(KERN_ERR
+ "PM: Cannot start decompression threads\n");
+ ret = -ENOMEM;
+ goto out_clean;
}
}
- unc = vmalloc(LZO_UNC_SIZE);
- if (!unc) {
- printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n");
-
- for (i = 0; i < LZO_CMP_PAGES; i++)
- free_page((unsigned long)page[i]);
-
- return -ENOMEM;
+ /*
+ * Start the CRC32 thread.
+ */
+ init_waitqueue_head(&crc->go);
+ init_waitqueue_head(&crc->done);
+
+ handle->crc32 = 0;
+ crc->crc32 = &handle->crc32;
+ for (thr = 0; thr < nr_threads; thr++) {
+ crc->unc[thr] = data[thr].unc;
+ crc->unc_len[thr] = &data[thr].unc_len;
}
- cmp = vmalloc(LZO_CMP_SIZE);
- if (!cmp) {
- printk(KERN_ERR "PM: Failed to allocate LZO compressed\n");
+ crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
+ if (IS_ERR(crc->thr)) {
+ crc->thr = NULL;
+ printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ }
- vfree(unc);
- for (i = 0; i < LZO_CMP_PAGES; i++)
- free_page((unsigned long)page[i]);
+ /*
+ * Adjust number of pages for read buffering, in case we are short.
+ */
+ read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
+ read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
- return -ENOMEM;
+ for (i = 0; i < read_pages; i++) {
+ page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
+ __GFP_WAIT | __GFP_HIGH :
+ __GFP_WAIT);
+ if (!page[i]) {
+ if (i < LZO_CMP_PAGES) {
+ ring_size = i;
+ printk(KERN_ERR
+ "PM: Failed to allocate LZO pages\n");
+ ret = -ENOMEM;
+ goto out_clean;
+ } else {
+ break;
+ }
+ }
}
+ want = ring_size = i;
printk(KERN_INFO
+ "PM: Using %u thread(s) for decompression.\n"
"PM: Loading and decompressing image data (%u pages) ... ",
- nr_to_read);
+ nr_threads, nr_to_read);
m = nr_to_read / 100;
if (!m)
m = 1;
@@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle,
bio = NULL;
do_gettimeofday(&start);
- error = snapshot_write_next(snapshot);
- if (error <= 0)
+ ret = snapshot_write_next(snapshot);
+ if (ret <= 0)
goto out_finish;
- for (;;) {
- error = swap_read_page(handle, page[0], NULL); /* sync */
- if (error)
- break;
-
- cmp_len = *(size_t *)page[0];
- if (unlikely(!cmp_len ||
- cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) {
- printk(KERN_ERR "PM: Invalid LZO compressed length\n");
- error = -1;
- break;
+ for(;;) {
+ for (i = 0; !eof && i < want; i++) {
+ ret = swap_read_page(handle, page[ring], &bio);
+ if (ret) {
+ /*
+ * On real read error, finish. On end of data,
+ * set EOF flag and just exit the read loop.
+ */
+ if (handle->cur &&
+ handle->cur->entries[handle->k]) {
+ goto out_finish;
+ } else {
+ eof = 1;
+ break;
+ }
+ }
+ if (++ring >= ring_size)
+ ring = 0;
}
+ asked += i;
+ want -= i;
- for (off = PAGE_SIZE, i = 1;
- off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
- error = swap_read_page(handle, page[i], &bio);
- if (error)
+ /*
+ * We are out of data, wait for some more.
+ */
+ if (!have) {
+ if (!asked)
+ break;
+
+ ret = hib_wait_on_bio_chain(&bio);
+ if (ret)
goto out_finish;
+ have += asked;
+ asked = 0;
+ if (eof)
+ eof = 2;
}
- error = hib_wait_on_bio_chain(&bio); /* need all data now */
- if (error)
- goto out_finish;
-
- for (off = 0, i = 0;
- off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
- memcpy(cmp + off, page[i], PAGE_SIZE);
+ if (crc->run_threads) {
+ wait_event(crc->done, atomic_read(&crc->stop));
+ atomic_set(&crc->stop, 0);
+ crc->run_threads = 0;
}
- unc_len = LZO_UNC_SIZE;
- error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len,
- unc, &unc_len);
- if (error < 0) {
- printk(KERN_ERR "PM: LZO decompression failed\n");
- break;
+ for (thr = 0; have && thr < nr_threads; thr++) {
+ data[thr].cmp_len = *(size_t *)page[pg];
+ if (unlikely(!data[thr].cmp_len ||
+ data[thr].cmp_len >
+ lzo1x_worst_compress(LZO_UNC_SIZE))) {
+ printk(KERN_ERR
+ "PM: Invalid LZO compressed length\n");
+ ret = -1;
+ goto out_finish;
+ }
+
+ need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
+ PAGE_SIZE);
+ if (need > have) {
+ if (eof > 1) {
+ ret = -1;
+ goto out_finish;
+ }
+ break;
+ }
+
+ for (off = 0;
+ off < LZO_HEADER + data[thr].cmp_len;
+ off += PAGE_SIZE) {
+ memcpy(data[thr].cmp + off,
+ page[pg], PAGE_SIZE);
+ have--;
+ want++;
+ if (++pg >= ring_size)
+ pg = 0;
+ }
+
+ atomic_set(&data[thr].ready, 1);
+ wake_up(&data[thr].go);
}
- if (unlikely(!unc_len ||
- unc_len > LZO_UNC_SIZE ||
- unc_len & (PAGE_SIZE - 1))) {
- printk(KERN_ERR "PM: Invalid LZO uncompressed length\n");
- error = -1;
- break;
+ /*
+ * Wait for more data while we are decompressing.
+ */
+ if (have < LZO_CMP_PAGES && asked) {
+ ret = hib_wait_on_bio_chain(&bio);
+ if (ret)
+ goto out_finish;
+ have += asked;
+ asked = 0;
+ if (eof)
+ eof = 2;
}
- for (off = 0; off < unc_len; off += PAGE_SIZE) {
- memcpy(data_of(*snapshot), unc + off, PAGE_SIZE);
+ for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
+ wait_event(data[thr].done,
+ atomic_read(&data[thr].stop));
+ atomic_set(&data[thr].stop, 0);
+
+ ret = data[thr].ret;
- if (!(nr_pages % m))
- printk("\b\b\b\b%3d%%", nr_pages / m);
- nr_pages++;
+ if (ret < 0) {
+ printk(KERN_ERR
+ "PM: LZO decompression failed\n");
+ goto out_finish;
+ }
- error = snapshot_write_next(snapshot);
- if (error <= 0)
+ if (unlikely(!data[thr].unc_len ||
+ data[thr].unc_len > LZO_UNC_SIZE ||
+ data[thr].unc_len & (PAGE_SIZE - 1))) {
+ printk(KERN_ERR
+ "PM: Invalid LZO uncompressed length\n");
+ ret = -1;
goto out_finish;
+ }
+
+ for (off = 0;
+ off < data[thr].unc_len; off += PAGE_SIZE) {
+ memcpy(data_of(*snapshot),
+ data[thr].unc + off, PAGE_SIZE);
+
+ if (!(nr_pages % m))
+ printk("\b\b\b\b%3d%%", nr_pages / m);
+ nr_pages++;
+
+ ret = snapshot_write_next(snapshot);
+ if (ret <= 0) {
+ crc->run_threads = thr + 1;
+ atomic_set(&crc->ready, 1);
+ wake_up(&crc->go);
+ goto out_finish;
+ }
+ }
}
+
+ crc->run_threads = thr;
+ atomic_set(&crc->ready, 1);
+ wake_up(&crc->go);
}
out_finish:
+ if (crc->run_threads) {
+ wait_event(crc->done, atomic_read(&crc->stop));
+ atomic_set(&crc->stop, 0);
+ }
do_gettimeofday(&stop);
- if (!error) {
+ if (!ret) {
printk("\b\b\b\bdone\n");
snapshot_write_finalize(snapshot);
if (!snapshot_image_loaded(snapshot))
- error = -ENODATA;
+ ret = -ENODATA;
+ if (!ret) {
+ if (swsusp_header->flags & SF_CRC32_MODE) {
+ if(handle->crc32 != swsusp_header->crc32) {
+ printk(KERN_ERR
+ "PM: Invalid image CRC32!\n");
+ ret = -ENODATA;
+ }
+ }
+ }
} else
printk("\n");
swsusp_show_speed(&start, &stop, nr_to_read, "Read");
-
- vfree(cmp);
- vfree(unc);
- for (i = 0; i < LZO_CMP_PAGES; i++)
+out_clean:
+ for (i = 0; i < ring_size; i++)
free_page((unsigned long)page[i]);
+ if (crc) {
+ if (crc->thr)
+ kthread_stop(crc->thr);
+ kfree(crc);
+ }
+ if (data) {
+ for (thr = 0; thr < nr_threads; thr++)
+ if (data[thr].thr)
+ kthread_stop(data[thr].thr);
+ vfree(data);
+ }
+ if (page) vfree(page);
- return error;
+ return ret;
}
/**
diff --git a/kernel/printk.c b/kernel/printk.c
index 28a40d8171b8..b7da18391c38 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -100,7 +100,7 @@ static int console_locked, console_suspended;
* It is also used in interesting ways to provide interlocking in
* console_unlock();.
*/
-static DEFINE_SPINLOCK(logbuf_lock);
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
#define LOG_BUF_MASK (log_buf_len-1)
#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early)
return;
}
- spin_lock_irqsave(&logbuf_lock, flags);
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
log_buf_len = new_log_buf_len;
log_buf = new_log_buf;
new_log_buf_len = 0;
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early)
log_start -= offset;
con_start -= offset;
log_end -= offset;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
pr_info("log_buf_len: %d\n", log_buf_len);
pr_info("early log buf free: %d(%d%%)\n",
@@ -365,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
if (error)
goto out;
i = 0;
- spin_lock_irq(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
while (!error && (log_start != log_end) && i < len) {
c = LOG_BUF(log_start);
log_start++;
- spin_unlock_irq(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
error = __put_user(c,buf);
buf++;
i++;
cond_resched();
- spin_lock_irq(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
}
- spin_unlock_irq(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
if (!error)
error = i;
break;
@@ -399,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
count = len;
if (count > log_buf_len)
count = log_buf_len;
- spin_lock_irq(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
if (count > logged_chars)
count = logged_chars;
if (do_clear)
@@ -416,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
if (j + log_buf_len < log_end)
break;
c = LOG_BUF(j);
- spin_unlock_irq(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
error = __put_user(c,&buf[count-1-i]);
cond_resched();
- spin_lock_irq(&logbuf_lock);
+ raw_spin_lock_irq(&logbuf_lock);
}
- spin_unlock_irq(&logbuf_lock);
+ raw_spin_unlock_irq(&logbuf_lock);
if (error)
break;
error = i;
@@ -689,7 +689,7 @@ static void zap_locks(void)
oops_timestamp = jiffies;
/* If a crash is occurring, make sure we can't deadlock */
- spin_lock_init(&logbuf_lock);
+ raw_spin_lock_init(&logbuf_lock);
/* And make sure that we print immediately */
sema_init(&console_sem, 1);
}
@@ -802,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu)
}
}
printk_cpu = UINT_MAX;
- spin_unlock(&logbuf_lock);
if (wake)
up(&console_sem);
+ raw_spin_unlock(&logbuf_lock);
return retval;
}
static const char recursion_bug_msg [] =
@@ -864,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
}
lockdep_off();
- spin_lock(&logbuf_lock);
+ raw_spin_lock(&logbuf_lock);
printk_cpu = this_cpu;
if (recursion_bug) {
@@ -1257,14 +1257,14 @@ void console_unlock(void)
again:
for ( ; ; ) {
- spin_lock_irqsave(&logbuf_lock, flags);
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
wake_klogd |= log_start - log_end;
if (con_start == log_end)
break; /* Nothing to print */
_con_start = con_start;
_log_end = log_end;
con_start = log_end; /* Flush */
- spin_unlock(&logbuf_lock);
+ raw_spin_unlock(&logbuf_lock);
stop_critical_timings(); /* don't trace print latency */
call_console_drivers(_con_start, _log_end);
start_critical_timings();
@@ -1276,7 +1276,7 @@ again:
if (unlikely(exclusive_console))
exclusive_console = NULL;
- spin_unlock(&logbuf_lock);
+ raw_spin_unlock(&logbuf_lock);
up(&console_sem);
@@ -1286,13 +1286,13 @@ again:
* there's a new owner and the console_unlock() from them will do the
* flush, no worries.
*/
- spin_lock(&logbuf_lock);
+ raw_spin_lock(&logbuf_lock);
if (con_start != log_end)
retry = 1;
- spin_unlock_irqrestore(&logbuf_lock, flags);
if (retry && console_trylock())
goto again;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd)
wake_up_klogd();
}
@@ -1522,9 +1522,9 @@ void register_console(struct console *newcon)
* console_unlock(); will print out the buffered messages
* for us.
*/
- spin_lock_irqsave(&logbuf_lock, flags);
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
/*
* We're about to replay the log buffer. Only do this to the
* just-registered console to avoid excessive message spam to
@@ -1731,10 +1731,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
/* Theoretically, the log could move on after we do this, but
there's not a lot we can do about that. The new messages
will overwrite the start of what we dump. */
- spin_lock_irqsave(&logbuf_lock, flags);
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
end = log_end & LOG_BUF_MASK;
chars = logged_chars;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
if (chars > end) {
s1 = log_buf + log_buf_len - chars + end;
diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 000000000000..f600868d550d
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,85 @@
+/*
+ * Read-Copy Update definitions shared among RCU implementations.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2011
+ *
+ * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+ */
+
+#ifndef __LINUX_RCU_H
+#define __LINUX_RCU_H
+
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(stmt) stmt
+#else /* #ifdef CONFIG_RCU_TRACE */
+#define RCU_TRACE(stmt)
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
+ * by call_rcu() and rcu callback execution, and are therefore not part of the
+ * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
+ */
+
+#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
+# define STATE_RCU_HEAD_READY 0
+# define STATE_RCU_HEAD_QUEUED 1
+
+extern struct debug_obj_descr rcuhead_debug_descr;
+
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+ WARN_ON_ONCE((unsigned long)head & 0x3);
+ debug_object_activate(head, &rcuhead_debug_descr);
+ debug_object_active_state(head, &rcuhead_debug_descr,
+ STATE_RCU_HEAD_READY,
+ STATE_RCU_HEAD_QUEUED);
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+ debug_object_active_state(head, &rcuhead_debug_descr,
+ STATE_RCU_HEAD_QUEUED,
+ STATE_RCU_HEAD_READY);
+ debug_object_deactivate(head, &rcuhead_debug_descr);
+}
+#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+static inline void debug_rcu_head_queue(struct rcu_head *head)
+{
+}
+
+static inline void debug_rcu_head_unqueue(struct rcu_head *head)
+{
+}
+#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
+
+extern void kfree(const void *);
+
+static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
+{
+ unsigned long offset = (unsigned long)head->func;
+
+ if (__is_kfree_rcu_offset(offset)) {
+ RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
+ kfree((void *)head - offset);
+ } else {
+ RCU_TRACE(trace_rcu_invoke_callback(rn, head));
+ head->func(head);
+ }
+}
+
+#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb320be61..ca0d23b6b3e8 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,6 +46,11 @@
#include <linux/module.h>
#include <linux/hardirq.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
@@ -94,11 +99,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+struct rcu_synchronize {
+ struct rcu_head head;
+ struct completion completion;
+};
+
/*
* Awaken the corresponding synchronize_rcu() instance now that a
* grace period has elapsed.
*/
-void wakeme_after_rcu(struct rcu_head *head)
+static void wakeme_after_rcu(struct rcu_head *head)
{
struct rcu_synchronize *rcu;
@@ -106,6 +116,20 @@ void wakeme_after_rcu(struct rcu_head *head)
complete(&rcu->completion);
}
+void wait_rcu_gp(call_rcu_func_t crf)
+{
+ struct rcu_synchronize rcu;
+
+ init_rcu_head_on_stack(&rcu.head);
+ init_completion(&rcu.completion);
+ /* Will wake me after RCU finished. */
+ crf(&rcu.head, wakeme_after_rcu);
+ /* Wait for it. */
+ wait_for_completion(&rcu.completion);
+ destroy_rcu_head_on_stack(&rcu.head);
+}
+EXPORT_SYMBOL_GPL(wait_rcu_gp);
+
#ifdef CONFIG_PROVE_RCU
/*
* wrapper function to avoid #include problems.
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d0f5ab..da775c87f27f 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -37,16 +37,17 @@
#include <linux/cpu.h>
#include <linux/prefetch.h>
-/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
-static struct task_struct *rcu_kthread_task;
-static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
-static unsigned long have_rcu_kthread_work;
+#ifdef CONFIG_RCU_TRACE
+#include <trace/events/rcu.h>
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+#include "rcu.h"
/* Forward declarations for rcutiny_plugin.h. */
struct rcu_ctrlblk;
-static void invoke_rcu_kthread(void);
-static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
-static int rcu_kthread(void *arg);
+static void invoke_rcu_callbacks(void);
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
+static void rcu_process_callbacks(struct softirq_action *unused);
static void __call_rcu(struct rcu_head *head,
void (*func)(struct rcu_head *rcu),
struct rcu_ctrlblk *rcp);
@@ -96,16 +97,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
}
/*
- * Wake up rcu_kthread() to process callbacks now eligible for invocation
- * or to boost readers.
- */
-static void invoke_rcu_kthread(void)
-{
- have_rcu_kthread_work = 1;
- wake_up(&rcu_kthread_wq);
-}
-
-/*
* Record an rcu quiescent state. And an rcu_bh quiescent state while we
* are at it, given that any rcu quiescent state is also an rcu_bh
* quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -117,7 +108,7 @@ void rcu_sched_qs(int cpu)
local_irq_save(flags);
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
- invoke_rcu_kthread();
+ invoke_rcu_callbacks();
local_irq_restore(flags);
}
@@ -130,7 +121,7 @@ void rcu_bh_qs(int cpu)
local_irq_save(flags);
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
- invoke_rcu_kthread();
+ invoke_rcu_callbacks();
local_irq_restore(flags);
}
@@ -154,18 +145,23 @@ void rcu_check_callbacks(int cpu, int user)
* Invoke the RCU callbacks on the specified rcu_ctrlkblk structure
* whose grace period has elapsed.
*/
-static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
{
+ char *rn = NULL;
struct rcu_head *next, *list;
unsigned long flags;
RCU_TRACE(int cb_count = 0);
/* If no RCU callbacks ready to invoke, just return. */
- if (&rcp->rcucblist == rcp->donetail)
+ if (&rcp->rcucblist == rcp->donetail) {
+ RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
+ RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
return;
+ }
/* Move the ready-to-invoke callbacks to a local list. */
local_irq_save(flags);
+ RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
list = rcp->rcucblist;
rcp->rcucblist = *rcp->donetail;
*rcp->donetail = NULL;
@@ -176,49 +172,26 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
local_irq_restore(flags);
/* Invoke the callbacks on the local list. */
+ RCU_TRACE(rn = rcp->name);
while (list) {
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- __rcu_reclaim(list);
+ __rcu_reclaim(rn, list);
local_bh_enable();
list = next;
RCU_TRACE(cb_count++);
}
RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
+ RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
}
-/*
- * This kthread invokes RCU callbacks whose grace periods have
- * elapsed. It is awakened as needed, and takes the place of the
- * RCU_SOFTIRQ that was used previously for this purpose.
- * This is a kthread, but it is never stopped, at least not until
- * the system goes down.
- */
-static int rcu_kthread(void *arg)
+static void rcu_process_callbacks(struct softirq_action *unused)
{
- unsigned long work;
- unsigned long morework;
- unsigned long flags;
-
- for (;;) {
- wait_event_interruptible(rcu_kthread_wq,
- have_rcu_kthread_work != 0);
- morework = rcu_boost();
- local_irq_save(flags);
- work = have_rcu_kthread_work;
- have_rcu_kthread_work = morework;
- local_irq_restore(flags);
- if (work) {
- rcu_process_callbacks(&rcu_sched_ctrlblk);
- rcu_process_callbacks(&rcu_bh_ctrlblk);
- rcu_preempt_process_callbacks();
- }
- schedule_timeout_interruptible(1); /* Leave CPU for others. */
- }
-
- return 0; /* Not reached, but needed to shut gcc up. */
+ __rcu_process_callbacks(&rcu_sched_ctrlblk);
+ __rcu_process_callbacks(&rcu_bh_ctrlblk);
+ rcu_preempt_process_callbacks();
}
/*
@@ -280,45 +253,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
__call_rcu(head, func, &rcu_bh_ctrlblk);
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
-
-void rcu_barrier_bh(void)
-{
- struct rcu_synchronize rcu;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu_bh(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_bh);
-
-void rcu_barrier_sched(void)
-{
- struct rcu_synchronize rcu;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu_sched(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-
-/*
- * Spawn the kthread that invokes RCU callbacks.
- */
-static int __init rcu_spawn_kthreads(void)
-{
- struct sched_param sp;
-
- rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
- sp.sched_priority = RCU_BOOST_PRIO;
- sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
- return 0;
-}
-early_initcall(rcu_spawn_kthreads);
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c676195f..02aa7139861c 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -26,29 +26,26 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
-#ifdef CONFIG_RCU_TRACE
-#define RCU_TRACE(stmt) stmt
-#else /* #ifdef CONFIG_RCU_TRACE */
-#define RCU_TRACE(stmt)
-#endif /* #else #ifdef CONFIG_RCU_TRACE */
-
/* Global control variables for rcupdate callback mechanism. */
struct rcu_ctrlblk {
struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
struct rcu_head **curtail; /* ->next pointer of last CB. */
RCU_TRACE(long qlen); /* Number of pending CBs. */
+ RCU_TRACE(char *name); /* Name of RCU type. */
};
/* Definition for rcupdate control block. */
static struct rcu_ctrlblk rcu_sched_ctrlblk = {
.donetail = &rcu_sched_ctrlblk.rcucblist,
.curtail = &rcu_sched_ctrlblk.rcucblist,
+ RCU_TRACE(.name = "rcu_sched")
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.donetail = &rcu_bh_ctrlblk.rcucblist,
.curtail = &rcu_bh_ctrlblk.rcucblist,
+ RCU_TRACE(.name = "rcu_bh")
};
#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +128,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
.rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
.nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
.blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
+ RCU_TRACE(.rcb.name = "rcu_preempt")
};
static int rcu_preempted_readers_exp(void);
@@ -247,6 +245,13 @@ static void show_tiny_preempt_stats(struct seq_file *m)
#include "rtmutex_common.h"
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+
+/* Controls for rcu_kthread() kthread. */
+static struct task_struct *rcu_kthread_task;
+static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
+static unsigned long have_rcu_kthread_work;
+
/*
* Carry out RCU priority boosting on the task indicated by ->boost_tasks,
* and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -334,7 +339,7 @@ static int rcu_initiate_boost(void)
if (rcu_preempt_ctrlblk.exp_tasks == NULL)
rcu_preempt_ctrlblk.boost_tasks =
rcu_preempt_ctrlblk.gp_tasks;
- invoke_rcu_kthread();
+ invoke_rcu_callbacks();
} else
RCU_TRACE(rcu_initiate_boost_trace());
return 1;
@@ -353,14 +358,6 @@ static void rcu_preempt_boost_start_gp(void)
#else /* #ifdef CONFIG_RCU_BOOST */
/*
- * If there is no RCU priority boosting, we don't boost.
- */
-static int rcu_boost(void)
-{
- return 0;
-}
-
-/*
* If there is no RCU priority boosting, we don't initiate boosting,
* but we do indicate whether there are blocked readers blocking the
* current grace period.
@@ -427,7 +424,7 @@ static void rcu_preempt_cpu_qs(void)
/* If there are done callbacks, cause them to be invoked. */
if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
- invoke_rcu_kthread();
+ invoke_rcu_callbacks();
}
/*
@@ -648,7 +645,7 @@ static void rcu_preempt_check_callbacks(void)
rcu_preempt_cpu_qs();
if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
rcu_preempt_ctrlblk.rcb.donetail)
- invoke_rcu_kthread();
+ invoke_rcu_callbacks();
if (rcu_preempt_gp_in_progress() &&
rcu_cpu_blocking_cur_gp() &&
rcu_preempt_running_reader())
@@ -674,7 +671,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
*/
static void rcu_preempt_process_callbacks(void)
{
- rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
+ __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
}
/*
@@ -697,20 +694,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu);
-void rcu_barrier(void)
-{
- struct rcu_synchronize rcu;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
/*
* synchronize_rcu - wait until a grace period has elapsed.
*
@@ -864,15 +847,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
#endif /* #ifdef CONFIG_RCU_TRACE */
/*
- * Because preemptible RCU does not exist, it is never necessary to
- * boost preempted RCU readers.
- */
-static int rcu_boost(void)
-{
- return 0;
-}
-
-/*
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
@@ -898,6 +872,78 @@ static void rcu_preempt_process_callbacks(void)
#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+
+/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_callbacks(void)
+{
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+}
+
+/*
+ * This kthread invokes RCU callbacks whose grace periods have
+ * elapsed. It is awakened as needed, and takes the place of the
+ * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
+ * This is a kthread, but it is never stopped, at least not until
+ * the system goes down.
+ */
+static int rcu_kthread(void *arg)
+{
+ unsigned long work;
+ unsigned long morework;
+ unsigned long flags;
+
+ for (;;) {
+ wait_event_interruptible(rcu_kthread_wq,
+ have_rcu_kthread_work != 0);
+ morework = rcu_boost();
+ local_irq_save(flags);
+ work = have_rcu_kthread_work;
+ have_rcu_kthread_work = morework;
+ local_irq_restore(flags);
+ if (work)
+ rcu_process_callbacks(NULL);
+ schedule_timeout_interruptible(1); /* Leave CPU for others. */
+ }
+
+ return 0; /* Not reached, but needed to shut gcc up. */
+}
+
+/*
+ * Spawn the kthread that invokes RCU callbacks.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+ struct sched_param sp;
+
+ rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
+ sp.sched_priority = RCU_BOOST_PRIO;
+ sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
+ return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+/*
+ * Start up softirq processing of callbacks.
+ */
+void invoke_rcu_callbacks(void)
+{
+ raise_softirq(RCU_SOFTIRQ);
+}
+
+void rcu_init(void)
+{
+ open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#include <linux/kernel_stat.h>
@@ -913,12 +959,6 @@ void __init rcu_scheduler_starting(void)
#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
-#ifdef CONFIG_RCU_BOOST
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else /* #ifdef CONFIG_RCU_BOOST */
-#define RCU_BOOST_PRIO 1
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
-
#ifdef CONFIG_RCU_TRACE
#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b13bb7e..764825c2685c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -73,7 +73,7 @@ module_param(nreaders, int, 0444);
MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
module_param(nfakewriters, int, 0444);
MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
-module_param(stat_interval, int, 0444);
+module_param(stat_interval, int, 0644);
MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
module_param(verbose, bool, 0444);
MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
}
-struct rcu_bh_torture_synchronize {
- struct rcu_head head;
- struct completion completion;
-};
-
-static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
-{
- struct rcu_bh_torture_synchronize *rcu;
-
- rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
- complete(&rcu->completion);
-}
-
-static void rcu_bh_torture_synchronize(void)
-{
- struct rcu_bh_torture_synchronize rcu;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
-}
-
static struct rcu_torture_ops rcu_bh_ops = {
.init = NULL,
.cleanup = NULL,
@@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
.readunlock = rcu_bh_torture_read_unlock,
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_bh_torture_deferred_free,
- .sync = rcu_bh_torture_synchronize,
+ .sync = synchronize_rcu_bh,
.cb_barrier = rcu_barrier_bh,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
.readunlock = rcu_bh_torture_read_unlock,
.completed = rcu_bh_torture_completed,
.deferred_free = rcu_sync_torture_deferred_free,
- .sync = rcu_bh_torture_synchronize,
+ .sync = synchronize_rcu_bh,
.cb_barrier = NULL,
.fqs = rcu_bh_force_quiescent_state,
.stats = NULL,
@@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
.name = "rcu_bh_sync"
};
+static struct rcu_torture_ops rcu_bh_expedited_ops = {
+ .init = rcu_sync_torture_init,
+ .cleanup = NULL,
+ .readlock = rcu_bh_torture_read_lock,
+ .read_delay = rcu_read_delay, /* just reuse rcu's version. */
+ .readunlock = rcu_bh_torture_read_unlock,
+ .completed = rcu_bh_torture_completed,
+ .deferred_free = rcu_sync_torture_deferred_free,
+ .sync = synchronize_rcu_bh_expedited,
+ .cb_barrier = NULL,
+ .fqs = rcu_bh_force_quiescent_state,
+ .stats = NULL,
+ .irq_capable = 1,
+ .name = "rcu_bh_expedited"
+};
+
/*
* Definitions for srcu torture testing.
*/
@@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
}
-static void sched_torture_synchronize(void)
-{
- synchronize_sched();
-}
-
static struct rcu_torture_ops sched_ops = {
.init = rcu_sync_torture_init,
.cleanup = NULL,
@@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = {
.readunlock = sched_torture_read_unlock,
.completed = rcu_no_completed,
.deferred_free = rcu_sched_torture_deferred_free,
- .sync = sched_torture_synchronize,
+ .sync = synchronize_sched,
.cb_barrier = rcu_barrier_sched,
.fqs = rcu_sched_force_quiescent_state,
.stats = NULL,
@@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = {
.readunlock = sched_torture_read_unlock,
.completed = rcu_no_completed,
.deferred_free = rcu_sync_torture_deferred_free,
- .sync = sched_torture_synchronize,
+ .sync = synchronize_sched,
.cb_barrier = NULL,
.fqs = rcu_sched_force_quiescent_state,
.stats = NULL,
@@ -754,7 +741,7 @@ static int rcu_torture_boost(void *arg)
do {
/* Wait for the next test interval. */
oldstarttime = boost_starttime;
- while (jiffies - oldstarttime > ULONG_MAX / 2) {
+ while (ULONG_CMP_LT(jiffies, oldstarttime)) {
schedule_timeout_uninterruptible(1);
rcu_stutter_wait("rcu_torture_boost");
if (kthread_should_stop() ||
@@ -765,7 +752,7 @@ static int rcu_torture_boost(void *arg)
/* Do one boost-test interval. */
endtime = oldstarttime + test_boost_duration * HZ;
call_rcu_time = jiffies;
- while (jiffies - endtime > ULONG_MAX / 2) {
+ while (ULONG_CMP_LT(jiffies, endtime)) {
/* If we don't have a callback in flight, post one. */
if (!rbi.inflight) {
smp_mb(); /* RCU core before ->inflight = 1. */
@@ -792,7 +779,8 @@ static int rcu_torture_boost(void *arg)
* interval. Besides, we are running at RT priority,
* so delays should be relatively rare.
*/
- while (oldstarttime == boost_starttime) {
+ while (oldstarttime == boost_starttime &&
+ !kthread_should_stop()) {
if (mutex_trylock(&boost_mutex)) {
boost_starttime = jiffies +
test_boost_interval * HZ;
@@ -809,11 +797,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
/* Clean up and exit. */
VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
- destroy_rcu_head_on_stack(&rbi.rcu);
rcutorture_shutdown_absorb("rcu_torture_boost");
while (!kthread_should_stop() || rbi.inflight)
schedule_timeout_uninterruptible(1);
smp_mb(); /* order accesses to ->inflight before stack-frame death. */
+ destroy_rcu_head_on_stack(&rbi.rcu);
return 0;
}
@@ -831,11 +819,13 @@ rcu_torture_fqs(void *arg)
VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
do {
fqs_resume_time = jiffies + fqs_stutter * HZ;
- while (jiffies - fqs_resume_time > LONG_MAX) {
+ while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
+ !kthread_should_stop()) {
schedule_timeout_interruptible(1);
}
fqs_burst_remaining = fqs_duration;
- while (fqs_burst_remaining > 0) {
+ while (fqs_burst_remaining > 0 &&
+ !kthread_should_stop()) {
cur_ops->fqs();
udelay(fqs_holdoff);
fqs_burst_remaining -= fqs_holdoff;
@@ -1280,8 +1270,9 @@ static int rcutorture_booster_init(int cpu)
/* Don't allow time recalculation while creating a new task. */
mutex_lock(&boost_mutex);
VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
- boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL,
- "rcu_torture_boost");
+ boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
+ cpu_to_node(cpu),
+ "rcu_torture_boost");
if (IS_ERR(boost_tasks[cpu])) {
retval = PTR_ERR(boost_tasks[cpu]);
VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1424,7 +1415,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] =
{ &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
- &rcu_bh_ops, &rcu_bh_sync_ops,
+ &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
&srcu_ops, &srcu_expedited_ops,
&sched_ops, &sched_sync_ops, &sched_expedited_ops, };
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207b1dd3..e234eb92a177 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -52,13 +52,16 @@
#include <linux/prefetch.h>
#include "rcutree.h"
+#include <trace/events/rcu.h>
+
+#include "rcu.h"
/* Data structures. */
static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
#define RCU_STATE_INITIALIZER(structname) { \
- .level = { &structname.node[0] }, \
+ .level = { &structname##_state.node[0] }, \
.levelcnt = { \
NUM_RCU_LVL_0, /* root of hierarchy. */ \
NUM_RCU_LVL_1, \
@@ -69,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
.signaled = RCU_GP_IDLE, \
.gpnum = -300, \
.completed = -300, \
- .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
- .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
+ .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
+ .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
.n_force_qs = 0, \
.n_force_qs_ngp = 0, \
.name = #structname, \
}
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
+struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
static struct rcu_state *rcu_state;
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
-#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
-
/*
* Track the rcutorture test sequence number and the update version
* number within a given test. The rcutorture_testseq is incremented
@@ -156,33 +157,41 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
* Note a quiescent state. Because we do not need to know
* how many quiescent states passed, just if there was at least
* one since the start of the grace period, this just sets a flag.
+ * The caller must have disabled preemption.
*/
void rcu_sched_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
- rdp->passed_quiesc_completed = rdp->gpnum - 1;
+ rdp->passed_quiesce_gpnum = rdp->gpnum;
barrier();
- rdp->passed_quiesc = 1;
+ if (rdp->passed_quiesce == 0)
+ trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
+ rdp->passed_quiesce = 1;
}
void rcu_bh_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
- rdp->passed_quiesc_completed = rdp->gpnum - 1;
+ rdp->passed_quiesce_gpnum = rdp->gpnum;
barrier();
- rdp->passed_quiesc = 1;
+ if (rdp->passed_quiesce == 0)
+ trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
+ rdp->passed_quiesce = 1;
}
/*
* Note a context switch. This is a quiescent state for RCU-sched,
* and requires special handling for preemptible RCU.
+ * The caller must have disabled preemption.
*/
void rcu_note_context_switch(int cpu)
{
+ trace_rcu_utilization("Start context switch");
rcu_sched_qs(cpu);
rcu_preempt_note_context_switch(cpu);
+ trace_rcu_utilization("End context switch");
}
EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -193,7 +202,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
};
#endif /* #ifdef CONFIG_NO_HZ */
-static int blimit = 10; /* Maximum callbacks per softirq. */
+static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
static int qhimark = 10000; /* If this many pending, ignore blimit. */
static int qlowmark = 100; /* Once only this many pending, use blimit. */
@@ -314,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
* trust its state not to change because interrupts are disabled.
*/
if (cpu_is_offline(rdp->cpu)) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
rdp->offline_fqs++;
return 1;
}
@@ -354,19 +364,13 @@ void rcu_enter_nohz(void)
local_irq_restore(flags);
return;
}
+ trace_rcu_dyntick("Start");
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic_inc(); /* See above. */
atomic_inc(&rdtp->dynticks);
smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
local_irq_restore(flags);
-
- /* If the interrupt queued a callback, get out of dyntick mode. */
- if (in_irq() &&
- (__get_cpu_var(rcu_sched_data).nxtlist ||
- __get_cpu_var(rcu_bh_data).nxtlist ||
- rcu_preempt_needs_cpu(smp_processor_id())))
- set_need_resched();
}
/*
@@ -391,6 +395,7 @@ void rcu_exit_nohz(void)
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic_inc(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
+ trace_rcu_dyntick("End");
local_irq_restore(flags);
}
@@ -481,11 +486,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
- unsigned long curr;
- unsigned long snap;
+ unsigned int curr;
+ unsigned int snap;
- curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
- snap = (unsigned long)rdp->dynticks_snap;
+ curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
+ snap = (unsigned int)rdp->dynticks_snap;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -495,7 +500,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
+ if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
+ trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
rdp->dynticks_fqs++;
return 1;
}
@@ -537,6 +543,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
int cpu;
long delta;
unsigned long flags;
+ int ndetected;
struct rcu_node *rnp = rcu_get_root(rsp);
/* Only let one CPU complain about others per time interval. */
@@ -553,7 +560,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
* Now rat on any tasks that got kicked up to the root rcu_node
* due to CPU offlining.
*/
- rcu_print_task_stall(rnp);
+ ndetected = rcu_print_task_stall(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
/*
@@ -565,17 +572,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
rsp->name);
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave(&rnp->lock, flags);
- rcu_print_task_stall(rnp);
+ ndetected += rcu_print_task_stall(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (rnp->qsmask == 0)
continue;
for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
- if (rnp->qsmask & (1UL << cpu))
+ if (rnp->qsmask & (1UL << cpu)) {
printk(" %d", rnp->grplo + cpu);
+ ndetected++;
+ }
}
printk("} (detected by %d, t=%ld jiffies)\n",
smp_processor_id(), (long)(jiffies - rsp->gp_start));
- trigger_all_cpu_backtrace();
+ if (ndetected == 0)
+ printk(KERN_ERR "INFO: Stall ended before state dump start\n");
+ else if (!trigger_all_cpu_backtrace())
+ dump_stack();
/* If so configured, complain about tasks blocking the grace period. */
@@ -596,7 +608,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
*/
printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
- trigger_all_cpu_backtrace();
+ if (!trigger_all_cpu_backtrace())
+ dump_stack();
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
@@ -678,9 +691,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
* go looking for one.
*/
rdp->gpnum = rnp->gpnum;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
if (rnp->qsmask & rdp->grpmask) {
rdp->qs_pending = 1;
- rdp->passed_quiesc = 0;
+ rdp->passed_quiesce = 0;
} else
rdp->qs_pending = 0;
}
@@ -741,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
/* Remember that we saw this grace-period completion. */
rdp->completed = rnp->completed;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
/*
* If we were in an extended quiescent state, we may have
@@ -826,31 +841,31 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
- if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
- if (cpu_needs_another_gp(rsp, rdp))
- rsp->fqs_need_gp = 1;
- if (rnp->completed == rsp->completed) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- return;
- }
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ if (!rcu_scheduler_fully_active ||
+ !cpu_needs_another_gp(rsp, rdp)) {
+ /*
+ * Either the scheduler hasn't yet spawned the first
+ * non-idle task or this CPU does not need another
+ * grace period. Either way, don't start a new grace
+ * period.
+ */
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ if (rsp->fqs_active) {
/*
- * Propagate new ->completed value to rcu_node structures
- * so that other CPUs don't have to wait until the start
- * of the next grace period to process their callbacks.
+ * This CPU needs a grace period, but force_quiescent_state()
+ * is running. Tell it to start one on this CPU's behalf.
*/
- rcu_for_each_node_breadth_first(rsp, rnp) {
- raw_spin_lock(&rnp->lock); /* irqs already disabled. */
- rnp->completed = rsp->completed;
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
- }
- local_irq_restore(flags);
+ rsp->fqs_need_gp = 1;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
/* Advance to a new grace period and initialize state. */
rsp->gpnum++;
+ trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
@@ -865,6 +880,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
rcu_start_gp_per_cpu(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
+ trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+ rnp->level, rnp->grplo,
+ rnp->grphi, rnp->qsmask);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -901,6 +919,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
if (rnp == rdp->mynode)
rcu_start_gp_per_cpu(rsp, rnp, rdp);
rcu_preempt_boost_start_gp(rnp);
+ trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
+ rnp->level, rnp->grplo,
+ rnp->grphi, rnp->qsmask);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
@@ -922,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
unsigned long gp_duration;
+ struct rcu_node *rnp = rcu_get_root(rsp);
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
@@ -933,7 +956,41 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
gp_duration = jiffies - rsp->gp_start;
if (gp_duration > rsp->gp_max)
rsp->gp_max = gp_duration;
- rsp->completed = rsp->gpnum;
+
+ /*
+ * We know the grace period is complete, but to everyone else
+ * it appears to still be ongoing. But it is also the case
+ * that to everyone else it looks like there is nothing that
+ * they can do to advance the grace period. It is therefore
+ * safe for us to drop the lock in order to mark the grace
+ * period as completed in all of the rcu_node structures.
+ *
+ * But if this CPU needs another grace period, it will take
+ * care of this while initializing the next grace period.
+ * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
+ * because the callbacks have not yet been advanced: Those
+ * callbacks are waiting on the grace period that just now
+ * completed.
+ */
+ if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+
+ /*
+ * Propagate new ->completed value to rcu_node structures
+ * so that other CPUs don't have to wait until the start
+ * of the next grace period to process their callbacks.
+ */
+ rcu_for_each_node_breadth_first(rsp, rnp) {
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ rnp->completed = rsp->gpnum;
+ raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ }
+ rnp = rcu_get_root(rsp);
+ raw_spin_lock(&rnp->lock); /* irqs already disabled. */
+ }
+
+ rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
+ trace_rcu_grace_period(rsp->name, rsp->completed, "end");
rsp->signaled = RCU_GP_IDLE;
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
}
@@ -962,6 +1019,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
rnp->qsmask &= ~mask;
+ trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
+ mask, rnp->qsmask, rnp->level,
+ rnp->grplo, rnp->grphi,
+ !!rnp->gp_tasks);
if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
/* Other bits still set at this level, so done. */
@@ -1000,7 +1061,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
* based on quiescent states detected in an earlier grace period!
*/
static void
-rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)
+rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
{
unsigned long flags;
unsigned long mask;
@@ -1008,17 +1069,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
- if (lastcomp != rnp->completed) {
+ if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
/*
- * Someone beat us to it for this grace period, so leave.
- * The race with GP start is resolved by the fact that we
- * hold the leaf rcu_node lock, so that the per-CPU bits
- * cannot yet be initialized -- so we would simply find our
- * CPU's bit already cleared in rcu_report_qs_rnp() if this
- * race occurred.
+ * The grace period in which this quiescent state was
+ * recorded has ended, so don't report it upwards.
+ * We will instead need a new quiescent state that lies
+ * within the current grace period.
*/
- rdp->passed_quiesc = 0; /* try again later! */
+ rdp->passed_quiesce = 0; /* need qs for new gp. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -1062,14 +1121,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
* Was there a quiescent state since the beginning of the grace
* period? If no, then exit and wait for the next call.
*/
- if (!rdp->passed_quiesc)
+ if (!rdp->passed_quiesce)
return;
/*
* Tell RCU we are done (but rcu_report_qs_rdp() will be the
* judge of that).
*/
- rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);
+ rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -1130,11 +1189,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
if (rnp->qsmaskinit != 0) {
if (rnp != rdp->mynode)
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
+ else
+ trace_rcu_grace_period(rsp->name,
+ rnp->gpnum + 1 -
+ !!(rnp->qsmask & mask),
+ "cpuofl");
break;
}
- if (rnp == rdp->mynode)
+ if (rnp == rdp->mynode) {
+ trace_rcu_grace_period(rsp->name,
+ rnp->gpnum + 1 -
+ !!(rnp->qsmask & mask),
+ "cpuofl");
need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
- else
+ } else
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
mask = rnp->grpmask;
rnp = rnp->parent;
@@ -1190,17 +1258,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_head *next, *list, **tail;
- int count;
+ int bl, count;
/* If no callbacks are ready, just return.*/
- if (!cpu_has_callbacks_ready_to_invoke(rdp))
+ if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
+ trace_rcu_batch_start(rsp->name, 0, 0);
+ trace_rcu_batch_end(rsp->name, 0);
return;
+ }
/*
* Extract the list of ready callbacks, disabling to prevent
* races with call_rcu() from interrupt handlers.
*/
local_irq_save(flags);
+ bl = rdp->blimit;
+ trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
list = rdp->nxtlist;
rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
*rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1216,13 +1289,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
- __rcu_reclaim(list);
+ __rcu_reclaim(rsp->name, list);
list = next;
- if (++count >= rdp->blimit)
+ if (++count >= bl)
break;
}
local_irq_save(flags);
+ trace_rcu_batch_end(rsp->name, count);
/* Update count, and requeue any remaining callbacks. */
rdp->qlen -= count;
@@ -1250,7 +1324,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
local_irq_restore(flags);
- /* Re-raise the RCU softirq if there are callbacks remaining. */
+ /* Re-invoke RCU core processing if there are callbacks remaining. */
if (cpu_has_callbacks_ready_to_invoke(rdp))
invoke_rcu_core();
}
@@ -1258,7 +1332,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/*
* Check to see if this CPU is in a non-context-switch quiescent state
* (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
- * Also schedule the RCU softirq handler.
+ * Also schedule RCU core processing.
*
* This function must be called with hardirqs disabled. It is normally
* invoked from the scheduling-clock interrupt. If rcu_pending returns
@@ -1266,6 +1340,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
*/
void rcu_check_callbacks(int cpu, int user)
{
+ trace_rcu_utilization("Start scheduler-tick");
if (user ||
(idle_cpu(cpu) && rcu_scheduler_active &&
!in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1299,6 +1374,7 @@ void rcu_check_callbacks(int cpu, int user)
rcu_preempt_check_callbacks(cpu);
if (rcu_pending(cpu))
invoke_rcu_core();
+ trace_rcu_utilization("End scheduler-tick");
}
#ifdef CONFIG_SMP
@@ -1360,10 +1436,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
unsigned long flags;
struct rcu_node *rnp = rcu_get_root(rsp);
- if (!rcu_gp_in_progress(rsp))
+ trace_rcu_utilization("Start fqs");
+ if (!rcu_gp_in_progress(rsp)) {
+ trace_rcu_utilization("End fqs");
return; /* No grace period in progress, nothing to force. */
+ }
if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
+ trace_rcu_utilization("End fqs");
return; /* Someone else is already on the job. */
}
if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1412,11 +1492,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
rsp->fqs_need_gp = 0;
rcu_start_gp(rsp, flags); /* releases rnp->lock */
+ trace_rcu_utilization("End fqs");
return;
}
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
unlock_fqs_ret:
raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
+ trace_rcu_utilization("End fqs");
}
#else /* #ifdef CONFIG_SMP */
@@ -1429,9 +1511,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
#endif /* #else #ifdef CONFIG_SMP */
/*
- * This does the RCU processing work from softirq context for the
- * specified rcu_state and rcu_data structures. This may be called
- * only from the CPU to whom the rdp belongs.
+ * This does the RCU core processing work for the specified rcu_state
+ * and rcu_data structures. This may be called only from the CPU to
+ * whom the rdp belongs.
*/
static void
__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1468,24 +1550,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Do softirq processing for the current CPU.
+ * Do RCU core processing for the current CPU.
*/
static void rcu_process_callbacks(struct softirq_action *unused)
{
+ trace_rcu_utilization("Start RCU core");
__rcu_process_callbacks(&rcu_sched_state,
&__get_cpu_var(rcu_sched_data));
__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
rcu_preempt_process_callbacks();
-
- /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
- rcu_needs_cpu_flush();
+ trace_rcu_utilization("End RCU core");
}
/*
- * Wake up the current CPU's kthread. This replaces raise_softirq()
- * in earlier versions of RCU. Note that because we are running on
- * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
- * cannot disappear out from under us.
+ * Schedule RCU callback invocation. If the specified type of RCU
+ * does not support RCU priority boosting, just do a direct call,
+ * otherwise wake up the per-CPU kernel kthread. Note that because we
+ * are running on the current CPU with interrupts disabled, the
+ * rcu_cpu_kthread_task cannot disappear out from under us.
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
@@ -1530,6 +1612,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
rdp->qlen++;
+ if (__is_kfree_rcu_offset((unsigned long)func))
+ trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
+ rdp->qlen);
+ else
+ trace_rcu_callback(rsp->name, head, rdp->qlen);
+
/* If interrupts were disabled, don't dive into RCU core. */
if (irqs_disabled_flags(flags)) {
local_irq_restore(flags);
@@ -1613,18 +1701,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
*/
void synchronize_sched(void)
{
- struct rcu_synchronize rcu;
-
if (rcu_blocking_is_gp())
return;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu_sched(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
+ wait_rcu_gp(call_rcu_sched);
}
EXPORT_SYMBOL_GPL(synchronize_sched);
@@ -1639,18 +1718,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
*/
void synchronize_rcu_bh(void)
{
- struct rcu_synchronize rcu;
-
if (rcu_blocking_is_gp())
return;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu_bh(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
+ wait_rcu_gp(call_rcu_bh);
}
EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
@@ -1671,7 +1741,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
check_cpu_stall(rsp, rdp);
/* Is the RCU core waiting for a quiescent state from this CPU? */
- if (rdp->qs_pending && !rdp->passed_quiesc) {
+ if (rcu_scheduler_fully_active &&
+ rdp->qs_pending && !rdp->passed_quiesce) {
/*
* If force_quiescent_state() coming soon and this CPU
@@ -1683,7 +1754,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
jiffies))
set_need_resched();
- } else if (rdp->qs_pending && rdp->passed_quiesc) {
+ } else if (rdp->qs_pending && rdp->passed_quiesce) {
rdp->n_rp_report_qs++;
return 1;
}
@@ -1846,6 +1917,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
#endif /* #ifdef CONFIG_NO_HZ */
rdp->cpu = cpu;
+ rdp->rsp = rsp;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -1865,8 +1937,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave(&rnp->lock, flags);
- rdp->passed_quiesc = 0; /* We could be racing with new GP, */
- rdp->qs_pending = 1; /* so set up to respond to current GP. */
rdp->beenonline = 1; /* We have now been online. */
rdp->preemptible = preemptible;
rdp->qlen_last_fqs_check = 0;
@@ -1891,9 +1961,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
rnp->qsmaskinit |= mask;
mask = rnp->grpmask;
if (rnp == rdp->mynode) {
- rdp->gpnum = rnp->completed; /* if GP in progress... */
+ /*
+ * If there is a grace period in progress, we will
+ * set up to wait for it next time we run the
+ * RCU core code.
+ */
+ rdp->gpnum = rnp->completed;
rdp->completed = rnp->completed;
- rdp->passed_quiesc_completed = rnp->completed - 1;
+ rdp->passed_quiesce = 0;
+ rdp->qs_pending = 0;
+ rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
+ trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
}
raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
rnp = rnp->parent;
@@ -1919,6 +1997,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
struct rcu_node *rnp = rdp->mynode;
+ trace_rcu_utilization("Start CPU hotplug");
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
@@ -1954,6 +2033,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
default:
break;
}
+ trace_rcu_utilization("End CPU hotplug");
return NOTIFY_OK;
}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccda26fb..849ce9ec51fe 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -230,9 +230,9 @@ struct rcu_data {
/* in order to detect GP end. */
unsigned long gpnum; /* Highest gp number that this CPU */
/* is aware of having started. */
- unsigned long passed_quiesc_completed;
- /* Value of completed at time of qs. */
- bool passed_quiesc; /* User-mode/idle loop etc. */
+ unsigned long passed_quiesce_gpnum;
+ /* gpnum at time of quiescent state. */
+ bool passed_quiesce; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
bool preemptible; /* Preemptible RCU? */
@@ -299,6 +299,7 @@ struct rcu_data {
unsigned long n_rp_need_nothing;
int cpu;
+ struct rcu_state *rsp;
};
/* Values for signaled field in struct rcu_state. */
@@ -417,6 +418,13 @@ extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DECLARE_PER_CPU(char, rcu_cpu_has_work);
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
#ifndef RCU_TREE_NONCORE
/* Forward declarations for rcutree_plugin.h */
@@ -430,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
static void rcu_stop_cpu_kthread(int cpu);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
-static void rcu_print_task_stall(struct rcu_node *rnp);
+static int rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_stall_reset(void);
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
@@ -450,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu);
static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
-static void rcu_needs_cpu_flush(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
static void invoke_rcu_callbacks_kthread(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb80b8b0..4b9b9f8a4184 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
#include <linux/delay.h>
#include <linux/stop_machine.h>
+#define RCU_KTHREAD_PRIO 1
+
+#ifdef CONFIG_RCU_BOOST
+#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
+#else
+#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
+#endif
+
/*
* Check the RCU kernel configuration parameters and print informative
* messages about anything out of the ordinary. If you like #ifdef, you
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_TREE_PREEMPT_RCU
-struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
+struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
static struct rcu_state *rcu_state = &rcu_preempt_state;
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu)
{
struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
- rdp->passed_quiesc_completed = rdp->gpnum - 1;
+ rdp->passed_quiesce_gpnum = rdp->gpnum;
barrier();
- rdp->passed_quiesc = 1;
+ if (rdp->passed_quiesce == 0)
+ trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
+ rdp->passed_quiesce = 1;
current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
}
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu)
if (rnp->qsmask & rdp->grpmask)
rnp->gp_tasks = &t->rcu_node_entry;
}
+ trace_rcu_preempt_task(rdp->rsp->name,
+ t->pid,
+ (rnp->qsmask & rdp->grpmask)
+ ? rnp->gpnum
+ : rnp->gpnum + 1);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
} else if (t->rcu_read_lock_nesting < 0 &&
t->rcu_read_unlock_special) {
@@ -299,6 +314,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
int empty_exp;
unsigned long flags;
struct list_head *np;
+#ifdef CONFIG_RCU_BOOST
+ struct rt_mutex *rbmp = NULL;
+#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
int special;
@@ -344,6 +362,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
+ t->rcu_blocked_node = NULL;
+ trace_rcu_unlock_preempted_task("rcu_preempt",
+ rnp->gpnum, t->pid);
if (&t->rcu_node_entry == rnp->gp_tasks)
rnp->gp_tasks = np;
if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -351,30 +372,34 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
- /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */
- if (t->rcu_boosted) {
- special |= RCU_READ_UNLOCK_BOOSTED;
- t->rcu_boosted = 0;
+ /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
+ if (t->rcu_boost_mutex) {
+ rbmp = t->rcu_boost_mutex;
+ t->rcu_boost_mutex = NULL;
}
#endif /* #ifdef CONFIG_RCU_BOOST */
- t->rcu_blocked_node = NULL;
/*
* If this was the last task on the current list, and if
* we aren't waiting on any CPUs, report the quiescent state.
* Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
*/
- if (empty)
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- else
+ if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
+ trace_rcu_quiescent_state_report("preempt_rcu",
+ rnp->gpnum,
+ 0, rnp->qsmask,
+ rnp->level,
+ rnp->grplo,
+ rnp->grphi,
+ !!rnp->gp_tasks);
rcu_report_unblock_qs_rnp(rnp, flags);
+ } else
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (special & RCU_READ_UNLOCK_BOOSTED) {
- rt_mutex_unlock(t->rcu_boost_mutex);
- t->rcu_boost_mutex = NULL;
- }
+ if (rbmp)
+ rt_mutex_unlock(rbmp);
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -399,10 +424,10 @@ void __rcu_read_unlock(void)
{
struct task_struct *t = current;
- barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
if (t->rcu_read_lock_nesting != 1)
--t->rcu_read_lock_nesting;
else {
+ barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
@@ -466,16 +491,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
* Scan the current list of tasks blocked within RCU read-side critical
* sections, printing out the tid of each.
*/
-static void rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp)
{
struct task_struct *t;
+ int ndetected = 0;
if (!rcu_preempt_blocked_readers_cgp(rnp))
- return;
+ return 0;
t = list_entry(rnp->gp_tasks,
struct task_struct, rcu_node_entry);
- list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
printk(" P%d", t->pid);
+ ndetected++;
+ }
+ return ndetected;
}
/*
@@ -656,18 +685,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
*/
void synchronize_rcu(void)
{
- struct rcu_synchronize rcu;
-
if (!rcu_scheduler_active)
return;
-
- init_rcu_head_on_stack(&rcu.head);
- init_completion(&rcu.completion);
- /* Will wake me after RCU finished. */
- call_rcu(&rcu.head, wakeme_after_rcu);
- /* Wait for it. */
- wait_for_completion(&rcu.completion);
- destroy_rcu_head_on_stack(&rcu.head);
+ wait_rcu_gp(call_rcu);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
@@ -968,8 +988,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
* Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
-static void rcu_print_task_stall(struct rcu_node *rnp)
+static int rcu_print_task_stall(struct rcu_node *rnp)
{
+ return 0;
}
/*
@@ -1136,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
#endif /* #else #ifdef CONFIG_RCU_TRACE */
+static struct lock_class_key rcu_boost_class;
+
/*
* Carry out RCU priority boosting on the task indicated by ->exp_tasks
* or ->boost_tasks, advancing the pointer to the next task in the
@@ -1198,8 +1221,10 @@ static int rcu_boost(struct rcu_node *rnp)
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
+ /* Avoid lockdep false positives. This rt_mutex is its own thing. */
+ lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
+ "rcu_boost_mutex");
t->rcu_boost_mutex = &mtx;
- t->rcu_boosted = 1;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
@@ -1228,9 +1253,12 @@ static int rcu_boost_kthread(void *arg)
int spincnt = 0;
int more2boost;
+ trace_rcu_utilization("Start boost kthread@init");
for (;;) {
rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+ trace_rcu_utilization("End boost kthread@rcu_wait");
rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
+ trace_rcu_utilization("Start boost kthread@rcu_wait");
rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
more2boost = rcu_boost(rnp);
if (more2boost)
@@ -1238,11 +1266,14 @@ static int rcu_boost_kthread(void *arg)
else
spincnt = 0;
if (spincnt > 10) {
+ trace_rcu_utilization("End boost kthread@rcu_yield");
rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+ trace_rcu_utilization("Start boost kthread@rcu_yield");
spincnt = 0;
}
}
/* NOTREACHED */
+ trace_rcu_utilization("End boost kthread@notreached");
return 0;
}
@@ -1291,11 +1322,9 @@ static void invoke_rcu_callbacks_kthread(void)
local_irq_save(flags);
__this_cpu_write(rcu_cpu_has_work, 1);
- if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
- local_irq_restore(flags);
- return;
- }
- wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+ if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
+ current != __this_cpu_read(rcu_cpu_kthread_task))
+ wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
local_irq_restore(flags);
}
@@ -1343,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
if (rnp->boost_kthread_task != NULL)
return 0;
t = kthread_create(rcu_boost_kthread, (void *)rnp,
- "rcub%d", rnp_index);
+ "rcub/%d", rnp_index);
if (IS_ERR(t))
return PTR_ERR(t);
raw_spin_lock_irqsave(&rnp->lock, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = RCU_KTHREAD_PRIO;
+ sp.sched_priority = RCU_BOOST_PRIO;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
return 0;
@@ -1444,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
{
struct sched_param sp;
struct timer_list yield_timer;
+ int prio = current->rt_priority;
setup_timer_on_stack(&yield_timer, f, arg);
mod_timer(&yield_timer, jiffies + 2);
@@ -1451,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
set_user_nice(current, 19);
schedule();
- sp.sched_priority = RCU_KTHREAD_PRIO;
+ set_user_nice(current, 0);
+ sp.sched_priority = prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
del_timer(&yield_timer);
}
@@ -1489,7 +1520,8 @@ static int rcu_cpu_kthread_should_stop(int cpu)
/*
* Per-CPU kernel thread that invokes RCU callbacks. This replaces the
- * earlier RCU softirq.
+ * RCU softirq used in flavors and configurations of RCU that do not
+ * support RCU priority boosting.
*/
static int rcu_cpu_kthread(void *arg)
{
@@ -1500,9 +1532,12 @@ static int rcu_cpu_kthread(void *arg)
char work;
char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+ trace_rcu_utilization("Start CPU kthread@init");
for (;;) {
*statusp = RCU_KTHREAD_WAITING;
+ trace_rcu_utilization("End CPU kthread@rcu_wait");
rcu_wait(*workp != 0 || kthread_should_stop());
+ trace_rcu_utilization("Start CPU kthread@rcu_wait");
local_bh_disable();
if (rcu_cpu_kthread_should_stop(cpu)) {
local_bh_enable();
@@ -1523,11 +1558,14 @@ static int rcu_cpu_kthread(void *arg)
spincnt = 0;
if (spincnt > 10) {
*statusp = RCU_KTHREAD_YIELDING;
+ trace_rcu_utilization("End CPU kthread@rcu_yield");
rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+ trace_rcu_utilization("Start CPU kthread@rcu_yield");
spincnt = 0;
}
}
*statusp = RCU_KTHREAD_STOPPED;
+ trace_rcu_utilization("End CPU kthread@term");
return 0;
}
@@ -1560,7 +1598,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
if (!rcu_scheduler_fully_active ||
per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
return 0;
- t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+ t = kthread_create_on_node(rcu_cpu_kthread,
+ (void *)(long)cpu,
+ cpu_to_node(cpu),
+ "rcuc/%d", cpu);
if (IS_ERR(t))
return PTR_ERR(t);
if (cpu_online(cpu))
@@ -1669,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
return 0;
if (rnp->node_kthread_task == NULL) {
t = kthread_create(rcu_node_kthread, (void *)rnp,
- "rcun%d", rnp_index);
+ "rcun/%d", rnp_index);
if (IS_ERR(t))
return PTR_ERR(t);
raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1907,15 +1948,6 @@ int rcu_needs_cpu(int cpu)
return rcu_needs_cpu_quick_check(cpu);
}
-/*
- * Check to see if we need to continue a callback-flush operations to
- * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
- * entry is not configured, so we never do need to.
- */
-static void rcu_needs_cpu_flush(void)
-{
-}
-
#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
#define RCU_NEEDS_CPU_FLUSHES 5
@@ -1991,20 +2023,4 @@ int rcu_needs_cpu(int cpu)
return c;
}
-/*
- * Check to see if we need to continue a callback-flush operations to
- * allow the last CPU to enter dyntick-idle mode.
- */
-static void rcu_needs_cpu_flush(void)
-{
- int cpu = smp_processor_id();
- unsigned long flags;
-
- if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
- return;
- local_irq_save(flags);
- (void)rcu_needs_cpu(cpu);
- local_irq_restore(flags);
-}
-
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c0986afc0..9feffa4c0695 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
#ifdef CONFIG_RCU_BOOST
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
-DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DECLARE_PER_CPU(char, rcu_cpu_has_work);
-
static char convert_kthread_status(unsigned int kthread_status)
{
if (kthread_status > RCU_KTHREAD_MAX)
@@ -66,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
return;
- seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
+ seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
rdp->cpu,
cpu_is_offline(rdp->cpu) ? '!' : ' ',
rdp->completed, rdp->gpnum,
- rdp->passed_quiesc, rdp->passed_quiesc_completed,
+ rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, " dt=%d/%d/%d df=%lu",
@@ -144,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->cpu,
cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
rdp->completed, rdp->gpnum,
- rdp->passed_quiesc, rdp->passed_quiesc_completed,
+ rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, ",%d,%d,%d,%lu",
@@ -175,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
static int show_rcudata_csv(struct seq_file *m, void *unused)
{
- seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
+ seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
#ifdef CONFIG_NO_HZ
seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
#endif /* #ifdef CONFIG_NO_HZ */
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 3c7cbc2c33be..a2e7e7210f3e 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -29,61 +29,6 @@
#include "rtmutex_common.h"
-# define TRACE_WARN_ON(x) WARN_ON(x)
-# define TRACE_BUG_ON(x) BUG_ON(x)
-
-# define TRACE_OFF() \
-do { \
- if (rt_trace_on) { \
- rt_trace_on = 0; \
- console_verbose(); \
- if (raw_spin_is_locked(&current->pi_lock)) \
- raw_spin_unlock(&current->pi_lock); \
- } \
-} while (0)
-
-# define TRACE_OFF_NOLOCK() \
-do { \
- if (rt_trace_on) { \
- rt_trace_on = 0; \
- console_verbose(); \
- } \
-} while (0)
-
-# define TRACE_BUG_LOCKED() \
-do { \
- TRACE_OFF(); \
- BUG(); \
-} while (0)
-
-# define TRACE_WARN_ON_LOCKED(c) \
-do { \
- if (unlikely(c)) { \
- TRACE_OFF(); \
- WARN_ON(1); \
- } \
-} while (0)
-
-# define TRACE_BUG_ON_LOCKED(c) \
-do { \
- if (unlikely(c)) \
- TRACE_BUG_LOCKED(); \
-} while (0)
-
-#ifdef CONFIG_SMP
-# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
-#else
-# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
-#endif
-
-/*
- * deadlock detection flag. We turn it off when we detect
- * the first problem because we dont want to recurse back
- * into the tracing code when doing error printk or
- * executing a BUG():
- */
-static int rt_trace_on = 1;
-
static void printk_task(struct task_struct *p)
{
if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
void rt_mutex_debug_task_free(struct task_struct *task)
{
- WARN_ON(!plist_head_empty(&task->pi_waiters));
- WARN_ON(task->pi_blocked_on);
+ DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
+ DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
}
/*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
{
struct task_struct *task;
- if (!rt_trace_on || detect || !act_waiter)
+ if (!debug_locks || detect || !act_waiter)
return;
task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
{
struct task_struct *task;
- if (!waiter->deadlock_lock || !rt_trace_on)
+ if (!waiter->deadlock_lock || !debug_locks)
return;
rcu_read_lock();
@@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
return;
}
- TRACE_OFF_NOLOCK();
+ if (!debug_locks_off()) {
+ rcu_read_unlock();
+ return;
+ }
printk("\n============================================\n");
printk( "[ BUG: circular locking deadlock detected! ]\n");
@@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
printk("[ turning off deadlock detection."
"Please report this trace. ]\n\n");
- local_irq_disable();
}
void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
void debug_rt_mutex_unlock(struct rt_mutex *lock)
{
- TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
+ DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
}
void
@@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
{
- TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
+ DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
}
void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
{
put_pid(waiter->deadlock_task_pid);
- TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
- TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
+ DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
+ DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
memset(waiter, 0x22, sizeof(*waiter));
}
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e1662acdb..5e8d9cce7470 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct rt_mutex_waiter *waiter)
{
int ret = 0;
+ int was_disabled;
for (;;) {
/* Try to acquire the lock: */
@@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
raw_spin_unlock(&lock->wait_lock);
+ was_disabled = irqs_disabled();
+ if (was_disabled)
+ local_irq_enable();
+
debug_rt_mutex_print_deadlock(waiter);
schedule_rt_mutex(lock);
+ if (was_disabled)
+ local_irq_disable();
+
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index 24637c782002..d87c6e5d4e8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1895,7 +1895,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
#ifdef CONFIG_SMP
/*
* After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
- * successfuly executed on another CPU. We must ensure that updates of
+ * successfully executed on another CPU. We must ensure that updates of
* per-task data have been completed by this moment.
*/
smp_wmb();
@@ -4346,6 +4346,7 @@ static inline void schedule_debug(struct task_struct *prev)
*/
if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
__schedule_bug(prev);
+ rcu_sleep_check();
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -6101,15 +6102,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
}
/*
- * In a system that switches off the HZ timer nohz_cpu_mask
- * indicates which cpus entered this state. This is used
- * in the rcu update to wait only for active cpus. For system
- * which do not switch off the HZ timer nohz_cpu_mask should
- * always be CPU_BITS_NONE.
- */
-cpumask_var_t nohz_cpu_mask;
-
-/*
* Increase the granularity value when there are more CPUs,
* because with more CPUs the 'effective latency' as visible
* to users decreases. But the relationship is not linear,
@@ -8348,8 +8340,6 @@ void __init sched_init(void)
*/
current->sched_class = &fair_sched_class;
- /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
- zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
#ifdef CONFIG_NO_HZ
@@ -8379,6 +8369,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
{
static unsigned long prev_jiffy; /* ratelimiting */
+ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
system_state != SYSTEM_RUNNING || oops_in_progress)
return;
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 331e01bcd026..87f9e36ea56e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk,
if (!cputimer->running)
return;
- spin_lock(&cputimer->lock);
+ raw_spin_lock(&cputimer->lock);
cputimer->cputime.utime =
cputime_add(cputimer->cputime.utime, cputime);
- spin_unlock(&cputimer->lock);
+ raw_spin_unlock(&cputimer->lock);
}
/**
@@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk,
if (!cputimer->running)
return;
- spin_lock(&cputimer->lock);
+ raw_spin_lock(&cputimer->lock);
cputimer->cputime.stime =
cputime_add(cputimer->cputime.stime, cputime);
- spin_unlock(&cputimer->lock);
+ raw_spin_unlock(&cputimer->lock);
}
/**
@@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (!cputimer->running)
return;
- spin_lock(&cputimer->lock);
+ raw_spin_lock(&cputimer->lock);
cputimer->cputime.sum_exec_runtime += ns;
- spin_unlock(&cputimer->lock);
+ raw_spin_unlock(&cputimer->lock);
}
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 94a62c0d4ade..d831841e55a7 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
{
unsigned long flags;
- spin_lock_irqsave(&sem->lock, flags);
+ raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
__down(sem);
- spin_unlock_irqrestore(&sem->lock, flags);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(down);
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
unsigned long flags;
int result = 0;
- spin_lock_irqsave(&sem->lock, flags);
+ raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
result = __down_interruptible(sem);
- spin_unlock_irqrestore(&sem->lock, flags);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
unsigned long flags;
int result = 0;
- spin_lock_irqsave(&sem->lock, flags);
+ raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
result = __down_killable(sem);
- spin_unlock_irqrestore(&sem->lock, flags);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
}
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
unsigned long flags;
int count;
- spin_lock_irqsave(&sem->lock, flags);
+ raw_spin_lock_irqsave(&sem->lock, flags);
count = sem->count - 1;
if (likely(count >= 0))
sem->count = count;
- spin_unlock_irqrestore(&sem->lock, flags);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
return (count < 0);
}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
unsigned long flags;
int result = 0;
- spin_lock_irqsave(&sem->lock, flags);
+ raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(sem->count > 0))
sem->count--;
else
result = __down_timeout(sem, jiffies);
- spin_unlock_irqrestore(&sem->lock, flags);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
return result;
}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
{
unsigned long flags;
- spin_lock_irqsave(&sem->lock, flags);
+ raw_spin_lock_irqsave(&sem->lock, flags);
if (likely(list_empty(&sem->wait_list)))
sem->count++;
else
__up(sem);
- spin_unlock_irqrestore(&sem->lock, flags);
+ raw_spin_unlock_irqrestore(&sem->lock, flags);
}
EXPORT_SYMBOL(up);
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
if (timeout <= 0)
goto timed_out;
__set_task_state(task, state);
- spin_unlock_irq(&sem->lock);
+ raw_spin_unlock_irq(&sem->lock);
timeout = schedule_timeout(timeout);
- spin_lock_irq(&sem->lock);
+ raw_spin_lock_irq(&sem->lock);
if (waiter.up)
return 0;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 291c9700be75..d252be2d3de5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
return error;
}
+static int kill_as_cred_perm(const struct cred *cred,
+ struct task_struct *target)
+{
+ const struct cred *pcred = __task_cred(target);
+ if (cred->user_ns != pcred->user_ns)
+ return 0;
+ if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
+ cred->uid != pcred->suid && cred->uid != pcred->uid)
+ return 0;
+ return 1;
+}
+
/* like kill_pid_info(), but doesn't use uid/euid of "current" */
-int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
- uid_t uid, uid_t euid, u32 secid)
+int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
+ const struct cred *cred, u32 secid)
{
int ret = -EINVAL;
struct task_struct *p;
- const struct cred *pcred;
unsigned long flags;
if (!valid_signal(sig))
@@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
ret = -ESRCH;
goto out_unlock;
}
- pcred = __task_cred(p);
- if (si_fromuser(info) &&
- euid != pcred->suid && euid != pcred->uid &&
- uid != pcred->suid && uid != pcred->uid) {
+ if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
ret = -EPERM;
goto out_unlock;
}
@@ -1384,7 +1392,7 @@ out_unlock:
rcu_read_unlock();
return ret;
}
-EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
+EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
/*
* kill_something_info() interprets pid in interesting ways just like kill(2).
diff --git a/kernel/sys.c b/kernel/sys.c
index 18ee1d2f6474..58459509b14c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1172,7 +1172,7 @@ DECLARE_RWSEM(uts_sem);
static int override_release(char __user *release, int len)
{
int ret = 0;
- char buf[len];
+ char buf[65];
if (current->personality & UNAME26) {
char *rest = UTS_RELEASE;
@@ -1759,6 +1759,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
sizeof(me->comm) - 1) < 0)
return -EFAULT;
set_task_comm(me, comm);
+ proc_comm_connector(me);
return 0;
case PR_GET_NAME:
get_task_comm(comm, me);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e8bffbe2ba4b..6318b511afa1 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
{ CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
{ CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
{ CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
- { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" },
+ /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
{ CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
{ CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
{ CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c44b407..eb98e55196b9 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
unsigned long flags;
- cpumask_clear_cpu(cpu, nohz_cpu_mask);
ts->idle_waketime = now;
local_irq_save(flags);
@@ -389,9 +388,6 @@ void tick_nohz_stop_sched_tick(int inidle)
else
expires.tv64 = KTIME_MAX;
- if (delta_jiffies > 1)
- cpumask_set_cpu(cpu, nohz_cpu_mask);
-
/* Skip reprogram of event if its not changed */
if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
goto out;
@@ -441,7 +437,6 @@ void tick_nohz_stop_sched_tick(int inidle)
* softirq.
*/
tick_do_update_jiffies64(ktime_get());
- cpumask_clear_cpu(cpu, nohz_cpu_mask);
}
raise_softirq_irqoff(TIMER_SOFTIRQ);
out:
@@ -524,7 +519,6 @@ void tick_nohz_restart_sched_tick(void)
/* Update jiffies first */
select_nohz_load_balancer(0);
tick_do_update_jiffies64(now);
- cpumask_clear_cpu(cpu, nohz_cpu_mask);
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
/*
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index a5d0a3a85dd8..0b537f27b559 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
/*
* Spinlock protecting the tables - not taken during lookup:
*/
-static DEFINE_SPINLOCK(table_lock);
+static DEFINE_RAW_SPINLOCK(table_lock);
/*
* Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
prev = NULL;
curr = *head;
- spin_lock(&table_lock);
+ raw_spin_lock(&table_lock);
/*
* Make sure we have not raced with another CPU:
*/
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
*head = curr;
}
out_unlock:
- spin_unlock(&table_lock);
+ raw_spin_unlock(&table_lock);
return curr;
}
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c5..5f39a07fe5ea 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
+CFLAGS_trace_events_filter.o := -I$(src)
+
#
# Make the trace clocks available generally: it's infrastructure
# relied on by ptrace for example:
@@ -53,6 +55,9 @@ endif
obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
obj-$(CONFIG_TRACEPOINTS) += power-traces.o
+ifeq ($(CONFIG_PM_RUNTIME),y)
+obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
+endif
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575e7829..077d85387908 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3863,6 +3863,14 @@ void ftrace_kill(void)
}
/**
+ * Test if ftrace is dead or not.
+ */
+int ftrace_is_dead(void)
+{
+ return ftrace_disabled;
+}
+
+/**
* register_ftrace_function - register a function for profiling
* @ops - ops structure that holds the function for profiling.
*
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4acc..f5b7b5c1195b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu {
int cpu;
atomic_t record_disabled;
struct ring_buffer *buffer;
- spinlock_t reader_lock; /* serialize readers */
+ raw_spinlock_t reader_lock; /* serialize readers */
arch_spinlock_t lock;
struct lock_class_key lock_key;
struct list_head *pages;
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu {
struct buffer_page *reader_page;
unsigned long lost_events;
unsigned long last_overrun;
+ local_t entries_bytes;
local_t commit_overrun;
local_t overrun;
local_t entries;
local_t committing;
local_t commits;
unsigned long read;
+ unsigned long read_bytes;
u64 write_stamp;
u64 read_stamp;
};
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
cpu_buffer->cpu = cpu;
cpu_buffer->buffer = buffer;
- spin_lock_init(&cpu_buffer->reader_lock);
+ raw_spin_lock_init(&cpu_buffer->reader_lock);
lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
struct list_head *p;
unsigned i;
- spin_lock_irq(&cpu_buffer->reader_lock);
+ raw_spin_lock_irq(&cpu_buffer->reader_lock);
rb_head_page_deactivate(cpu_buffer);
for (i = 0; i < nr_pages; i++) {
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
rb_check_pages(cpu_buffer);
out:
- spin_unlock_irq(&cpu_buffer->reader_lock);
+ raw_spin_unlock_irq(&cpu_buffer->reader_lock);
}
static void
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
struct list_head *p;
unsigned i;
- spin_lock_irq(&cpu_buffer->reader_lock);
+ raw_spin_lock_irq(&cpu_buffer->reader_lock);
rb_head_page_deactivate(cpu_buffer);
for (i = 0; i < nr_pages; i++) {
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
rb_check_pages(cpu_buffer);
out:
- spin_unlock_irq(&cpu_buffer->reader_lock);
+ raw_spin_unlock_irq(&cpu_buffer->reader_lock);
}
/**
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* the counters.
*/
local_add(entries, &cpu_buffer->overrun);
+ local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
/*
* The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail);
kmemcheck_annotate_bitfield(event, bitfield);
+ /* account for padding bytes */
+ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+
/*
* Save the original length to the meta data.
* This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (!tail)
tail_page->page->time_stamp = ts;
+ /* account for these added bytes */
+ local_add(length, &cpu_buffer->entries_bytes);
+
return event;
}
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
unsigned long write_mask =
local_read(&bpage->write) & ~RB_WRITE_MASK;
+ unsigned long event_length = rb_event_length(event);
/*
* This is on the tail page. It is possible that
* a write could come in and move the tail page
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
old_index += write_mask;
new_index += write_mask;
index = local_cmpxchg(&bpage->write, old_index, new_index);
- if (index == old_index)
+ if (index == old_index) {
+ /* update counters */
+ local_sub(event_length, &cpu_buffer->entries_bytes);
return 1;
+ }
}
/* could not discard */
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
}
/**
+ * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
+{
+ unsigned long flags;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ /*
+ * if the tail is on reader_page, oldest time stamp is on the reader
+ * page
+ */
+ if (cpu_buffer->tail_page == cpu_buffer->reader_page)
+ bpage = cpu_buffer->reader_page;
+ else
+ bpage = rb_set_head_page(cpu_buffer);
+ ret = bpage->page->time_stamp;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
+
+/**
+ * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to read from.
+ */
+unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long ret;
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
+
+/**
* ring_buffer_entries_cpu - get the number of entries in a cpu buffer
* @buffer: The ring buffer
* @cpu: The per CPU buffer to get the entries from.
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
cpu_buffer = iter->cpu_buffer;
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
rb_iter_reset(iter);
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
again:
local_irq_save(flags);
if (dolock)
- spin_lock(&cpu_buffer->reader_lock);
+ raw_spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
rb_advance_reader(cpu_buffer);
if (dolock)
- spin_unlock(&cpu_buffer->reader_lock);
+ raw_spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
unsigned long flags;
again:
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
event = rb_iter_peek(iter, ts);
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
if (event && event->type_len == RINGBUF_TYPE_PADDING)
goto again;
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
if (dolock)
- spin_lock(&cpu_buffer->reader_lock);
+ raw_spin_lock(&cpu_buffer->reader_lock);
event = rb_buffer_peek(cpu_buffer, ts, lost_events);
if (event) {
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
}
if (dolock)
- spin_unlock(&cpu_buffer->reader_lock);
+ raw_spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
out:
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
cpu_buffer = iter->cpu_buffer;
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
arch_spin_lock(&cpu_buffer->lock);
rb_iter_reset(iter);
arch_spin_unlock(&cpu_buffer->lock);
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
}
EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
unsigned long flags;
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
again:
event = rb_iter_peek(iter, ts);
if (!event)
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
rb_advance_iter(iter);
out:
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
return event;
}
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->reader_page->read = 0;
local_set(&cpu_buffer->commit_overrun, 0);
+ local_set(&cpu_buffer->entries_bytes, 0);
local_set(&cpu_buffer->overrun, 0);
local_set(&cpu_buffer->entries, 0);
local_set(&cpu_buffer->committing, 0);
local_set(&cpu_buffer->commits, 0);
cpu_buffer->read = 0;
+ cpu_buffer->read_bytes = 0;
cpu_buffer->write_stamp = 0;
cpu_buffer->read_stamp = 0;
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
atomic_inc(&cpu_buffer->record_disabled);
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
goto out;
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
arch_spin_unlock(&cpu_buffer->lock);
out:
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
atomic_dec(&cpu_buffer->record_disabled);
}
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
if (dolock)
- spin_lock(&cpu_buffer->reader_lock);
+ raw_spin_lock(&cpu_buffer->reader_lock);
ret = rb_per_cpu_empty(cpu_buffer);
if (dolock)
- spin_unlock(&cpu_buffer->reader_lock);
+ raw_spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
if (!ret)
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
if (dolock)
- spin_lock(&cpu_buffer->reader_lock);
+ raw_spin_lock(&cpu_buffer->reader_lock);
ret = rb_per_cpu_empty(cpu_buffer);
if (dolock)
- spin_unlock(&cpu_buffer->reader_lock);
+ raw_spin_unlock(&cpu_buffer->reader_lock);
local_irq_restore(flags);
return ret;
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
if (!bpage)
goto out;
- spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
reader = rb_get_reader_page(cpu_buffer);
if (!reader)
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
} else {
/* update the entry counter */
cpu_buffer->read += rb_page_entries(reader);
+ cpu_buffer->read_bytes += BUF_PAGE_SIZE;
/* swap the pages */
rb_init_page(bpage);
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
out_unlock:
- spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
out:
return ret;
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 000000000000..4b3b5eaf94d1
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/usb.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/rpm.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
+EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1d..f2bd275bb60f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
static int trace_stop_count;
-static DEFINE_SPINLOCK(tracing_start_lock);
+static DEFINE_RAW_SPINLOCK(tracing_start_lock);
static void wakeup_work_handler(struct work_struct *work)
{
@@ -435,6 +435,7 @@ static struct {
} trace_clocks[] = {
{ trace_clock_local, "local" },
{ trace_clock_global, "global" },
+ { trace_clock_counter, "counter" },
};
int trace_clock_id;
@@ -960,7 +961,7 @@ void tracing_start(void)
if (tracing_disabled)
return;
- spin_lock_irqsave(&tracing_start_lock, flags);
+ raw_spin_lock_irqsave(&tracing_start_lock, flags);
if (--trace_stop_count) {
if (trace_stop_count < 0) {
/* Someone screwed up their debugging */
@@ -985,7 +986,7 @@ void tracing_start(void)
ftrace_start();
out:
- spin_unlock_irqrestore(&tracing_start_lock, flags);
+ raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
}
/**
@@ -1000,7 +1001,7 @@ void tracing_stop(void)
unsigned long flags;
ftrace_stop();
- spin_lock_irqsave(&tracing_start_lock, flags);
+ raw_spin_lock_irqsave(&tracing_start_lock, flags);
if (trace_stop_count++)
goto out;
@@ -1018,7 +1019,7 @@ void tracing_stop(void)
arch_spin_unlock(&ftrace_max_lock);
out:
- spin_unlock_irqrestore(&tracing_start_lock, flags);
+ raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
}
void trace_stop_cmdline_recording(void);
@@ -2159,6 +2160,14 @@ void trace_default_header(struct seq_file *m)
}
}
+static void test_ftrace_alive(struct seq_file *m)
+{
+ if (!ftrace_is_dead())
+ return;
+ seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+ seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
+}
+
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
@@ -2168,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v)
if (iter->tr) {
seq_printf(m, "# tracer: %s\n", iter->trace->name);
seq_puts(m, "#\n");
+ test_ftrace_alive(m);
}
if (iter->trace && iter->trace->print_header)
iter->trace->print_header(m);
@@ -2710,9 +2720,9 @@ static const char readme_msg[] =
"# cat /sys/kernel/debug/tracing/trace_options\n"
"noprint-parent nosym-offset nosym-addr noverbose\n"
"# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
- "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n"
+ "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
"# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
- "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n"
+ "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
;
static ssize_t
@@ -3569,6 +3579,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
}
static ssize_t
+tracing_total_entries_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ char buf[64];
+ int r, cpu;
+ unsigned long size = 0, expanded_size = 0;
+
+ mutex_lock(&trace_types_lock);
+ for_each_tracing_cpu(cpu) {
+ size += tr->entries >> 10;
+ if (!ring_buffer_expanded)
+ expanded_size += trace_buf_size >> 10;
+ }
+ if (ring_buffer_expanded)
+ r = sprintf(buf, "%lu\n", size);
+ else
+ r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
+ mutex_unlock(&trace_types_lock);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -3594,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
return 0;
}
-static int mark_printk(const char *fmt, ...)
-{
- int ret;
- va_list args;
- va_start(args, fmt);
- ret = trace_vprintk(0, fmt, args);
- va_end(args);
- return ret;
-}
-
static ssize_t
tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
- char *buf;
- size_t written;
+ unsigned long addr = (unsigned long)ubuf;
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct print_entry *entry;
+ unsigned long irq_flags;
+ struct page *pages[2];
+ int nr_pages = 1;
+ ssize_t written;
+ void *page1;
+ void *page2;
+ int offset;
+ int size;
+ int len;
+ int ret;
if (tracing_disabled)
return -EINVAL;
@@ -3617,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (cnt > TRACE_BUF_SIZE)
cnt = TRACE_BUF_SIZE;
- buf = kmalloc(cnt + 2, GFP_KERNEL);
- if (buf == NULL)
- return -ENOMEM;
+ /*
+ * Userspace is injecting traces into the kernel trace buffer.
+ * We want to be as non intrusive as possible.
+ * To do so, we do not want to allocate any special buffers
+ * or take any locks, but instead write the userspace data
+ * straight into the ring buffer.
+ *
+ * First we need to pin the userspace buffer into memory,
+ * which, most likely it is, because it just referenced it.
+ * But there's no guarantee that it is. By using get_user_pages_fast()
+ * and kmap_atomic/kunmap_atomic() we can get access to the
+ * pages directly. We then write the data directly into the
+ * ring buffer.
+ */
+ BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
- if (copy_from_user(buf, ubuf, cnt)) {
- kfree(buf);
- return -EFAULT;
+ /* check if we cross pages */
+ if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
+ nr_pages = 2;
+
+ offset = addr & (PAGE_SIZE - 1);
+ addr &= PAGE_MASK;
+
+ ret = get_user_pages_fast(addr, nr_pages, 0, pages);
+ if (ret < nr_pages) {
+ while (--ret >= 0)
+ put_page(pages[ret]);
+ written = -EFAULT;
+ goto out;
+ }
+
+ page1 = kmap_atomic(pages[0]);
+ if (nr_pages == 2)
+ page2 = kmap_atomic(pages[1]);
+
+ local_save_flags(irq_flags);
+ size = sizeof(*entry) + cnt + 2; /* possible \n added */
+ buffer = global_trace.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
+ irq_flags, preempt_count());
+ if (!event) {
+ /* Ring buffer disabled, return as if not open for write */
+ written = -EBADF;
+ goto out_unlock;
}
- if (buf[cnt-1] != '\n') {
- buf[cnt] = '\n';
- buf[cnt+1] = '\0';
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = _THIS_IP_;
+
+ if (nr_pages == 2) {
+ len = PAGE_SIZE - offset;
+ memcpy(&entry->buf, page1 + offset, len);
+ memcpy(&entry->buf[len], page2, cnt - len);
} else
- buf[cnt] = '\0';
+ memcpy(&entry->buf, page1 + offset, cnt);
- written = mark_printk("%s", buf);
- kfree(buf);
- *fpos += written;
+ if (entry->buf[cnt - 1] != '\n') {
+ entry->buf[cnt] = '\n';
+ entry->buf[cnt + 1] = '\0';
+ } else
+ entry->buf[cnt] = '\0';
+
+ ring_buffer_unlock_commit(buffer, event);
- /* don't tell userspace we wrote more - it might confuse them */
- if (written > cnt)
- written = cnt;
+ written = cnt;
+ *fpos += written;
+
+ out_unlock:
+ if (nr_pages == 2)
+ kunmap_atomic(page2);
+ kunmap_atomic(page1);
+ while (nr_pages > 0)
+ put_page(pages[--nr_pages]);
+ out:
return written;
}
@@ -3739,6 +3828,12 @@ static const struct file_operations tracing_entries_fops = {
.llseek = generic_file_llseek,
};
+static const struct file_operations tracing_total_entries_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_total_entries_read,
+ .llseek = generic_file_llseek,
+};
+
static const struct file_operations tracing_free_buffer_fops = {
.write = tracing_free_buffer_write,
.release = tracing_free_buffer_release,
@@ -3808,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (info->read < PAGE_SIZE)
goto read;
- info->read = 0;
-
trace_access_lock(info->cpu);
ret = ring_buffer_read_page(info->tr->buffer,
&info->spare,
@@ -3819,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (ret < 0)
return 0;
+ info->read = 0;
+
read:
size = PAGE_SIZE - info->read;
if (size > count)
@@ -4026,6 +4121,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
struct trace_array *tr = &global_trace;
struct trace_seq *s;
unsigned long cnt;
+ unsigned long long t;
+ unsigned long usec_rem;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
@@ -4042,6 +4139,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+ cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
+ trace_seq_printf(s, "bytes: %ld\n", cnt);
+
+ t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
+ usec_rem = do_div(t, USEC_PER_SEC);
+ trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
+
+ t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
+ usec_rem = do_div(t, USEC_PER_SEC);
+ trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
+
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
kfree(s);
@@ -4450,6 +4558,9 @@ static __init int tracer_init_debugfs(void)
trace_create_file("buffer_size_kb", 0644, d_tracer,
&global_trace, &tracing_entries_fops);
+ trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+ &global_trace, &tracing_total_entries_fops);
+
trace_create_file("free_buffer", 0644, d_tracer,
&global_trace, &tracing_free_buffer_fops);
@@ -4566,6 +4677,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
tracing_off();
+ /* Did function tracer already get disabled? */
+ if (ftrace_is_dead()) {
+ printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+ printk("# MAY BE MISSING FUNCTION EVENTS\n");
+ }
+
if (disable_tracing)
ftrace_kill();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846bcfee5..092e1f8d18dc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
return test_tsk_trace_trace(task);
}
+extern int ftrace_is_dead(void);
#else
static inline int ftrace_trace_task(struct task_struct *task)
{
return 1;
}
+static inline int ftrace_is_dead(void) { return 0; }
#endif
/*
@@ -761,16 +763,10 @@ struct filter_pred {
filter_pred_fn_t fn;
u64 val;
struct regex regex;
- /*
- * Leaf nodes use field_name, ops is used by AND and OR
- * nodes. The field_name is always freed when freeing a pred.
- * We can overload field_name for ops and have it freed
- * as well.
- */
- union {
- char *field_name;
- unsigned short *ops;
- };
+ unsigned short *ops;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+ struct ftrace_event_field *field;
+#endif
int offset;
int not;
int op;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747a1398..394783531cbb 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void)
return now;
}
+
+static atomic64_t trace_counter;
+
+/*
+ * trace_clock_counter(): simply an atomic counter.
+ * Use the trace_counter "counter" for cases where you do not care
+ * about timings, but are interested in strict ordering.
+ */
+u64 notrace trace_clock_counter(void)
+{
+ return atomic64_add_return(1, &trace_counter);
+}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764ecccd6..816d3d074979 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
return pred;
}
+enum walk_return {
+ WALK_PRED_ABORT,
+ WALK_PRED_PARENT,
+ WALK_PRED_DEFAULT,
+};
+
+typedef int (*filter_pred_walkcb_t) (enum move_type move,
+ struct filter_pred *pred,
+ int *err, void *data);
+
+static int walk_pred_tree(struct filter_pred *preds,
+ struct filter_pred *root,
+ filter_pred_walkcb_t cb, void *data)
+{
+ struct filter_pred *pred = root;
+ enum move_type move = MOVE_DOWN;
+ int done = 0;
+
+ if (!preds)
+ return -EINVAL;
+
+ do {
+ int err = 0, ret;
+
+ ret = cb(move, pred, &err, data);
+ if (ret == WALK_PRED_ABORT)
+ return err;
+ if (ret == WALK_PRED_PARENT)
+ goto get_parent;
+
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ goto get_parent;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ get_parent:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent,
+ &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ /* We are fine. */
+ return 0;
+}
+
/*
* A series of AND or ORs where found together. Instead of
* climbing up and down the tree branches, an array of the
@@ -410,99 +467,91 @@ static int process_ops(struct filter_pred *preds,
for (i = 0; i < op->val; i++) {
pred = &preds[op->ops[i]];
- match = pred->fn(pred, rec);
+ if (!WARN_ON_ONCE(!pred->fn))
+ match = pred->fn(pred, rec);
if (!!match == type)
return match;
}
return match;
}
+struct filter_match_preds_data {
+ struct filter_pred *preds;
+ int match;
+ void *rec;
+};
+
+static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct filter_match_preds_data *d = data;
+
+ *err = 0;
+ switch (move) {
+ case MOVE_DOWN:
+ /* only AND and OR have children */
+ if (pred->left != FILTER_PRED_INVALID) {
+ /* If ops is set, then it was folded. */
+ if (!pred->ops)
+ return WALK_PRED_DEFAULT;
+ /* We can treat folded ops as a leaf node */
+ d->match = process_ops(d->preds, pred, d->rec);
+ } else {
+ if (!WARN_ON_ONCE(!pred->fn))
+ d->match = pred->fn(pred, d->rec);
+ }
+
+ return WALK_PRED_PARENT;
+ case MOVE_UP_FROM_LEFT:
+ /*
+ * Check for short circuits.
+ *
+ * Optimization: !!match == (pred->op == OP_OR)
+ * is the same as:
+ * if ((match && pred->op == OP_OR) ||
+ * (!match && pred->op == OP_AND))
+ */
+ if (!!d->match == (pred->op == OP_OR))
+ return WALK_PRED_PARENT;
+ break;
+ case MOVE_UP_FROM_RIGHT:
+ break;
+ }
+
+ return WALK_PRED_DEFAULT;
+}
+
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
- int match = -1;
- enum move_type move = MOVE_DOWN;
struct filter_pred *preds;
- struct filter_pred *pred;
struct filter_pred *root;
- int n_preds;
- int done = 0;
+ struct filter_match_preds_data data = {
+ /* match is currently meaningless */
+ .match = -1,
+ .rec = rec,
+ };
+ int n_preds, ret;
/* no filter is considered a match */
if (!filter)
return 1;
n_preds = filter->n_preds;
-
if (!n_preds)
return 1;
/*
* n_preds, root and filter->preds are protect with preemption disabled.
*/
- preds = rcu_dereference_sched(filter->preds);
root = rcu_dereference_sched(filter->root);
if (!root)
return 1;
- pred = root;
-
- /* match is currently meaningless */
- match = -1;
-
- do {
- switch (move) {
- case MOVE_DOWN:
- /* only AND and OR have children */
- if (pred->left != FILTER_PRED_INVALID) {
- /* If ops is set, then it was folded. */
- if (!pred->ops) {
- /* keep going to down the left side */
- pred = &preds[pred->left];
- continue;
- }
- /* We can treat folded ops as a leaf node */
- match = process_ops(preds, pred, rec);
- } else
- match = pred->fn(pred, rec);
- /* If this pred is the only pred */
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- case MOVE_UP_FROM_LEFT:
- /*
- * Check for short circuits.
- *
- * Optimization: !!match == (pred->op == OP_OR)
- * is the same as:
- * if ((match && pred->op == OP_OR) ||
- * (!match && pred->op == OP_AND))
- */
- if (!!match == (pred->op == OP_OR)) {
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- }
- /* now go down the right side of the tree. */
- pred = &preds[pred->right];
- move = MOVE_DOWN;
- continue;
- case MOVE_UP_FROM_RIGHT:
- /* We finished this equation. */
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- }
- done = 1;
- } while (!done);
-
- return match;
+ data.preds = preds = rcu_dereference_sched(filter->preds);
+ ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
+ WARN_ON(ret);
+ return data.match;
}
EXPORT_SYMBOL_GPL(filter_match_preds);
@@ -628,22 +677,6 @@ find_event_field(struct ftrace_event_call *call, char *name)
return __find_event_field(head, name);
}
-static void filter_free_pred(struct filter_pred *pred)
-{
- if (!pred)
- return;
-
- kfree(pred->field_name);
- kfree(pred);
-}
-
-static void filter_clear_pred(struct filter_pred *pred)
-{
- kfree(pred->field_name);
- pred->field_name = NULL;
- pred->regex.len = 0;
-}
-
static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
{
stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -689,20 +722,13 @@ __pop_pred_stack(struct pred_stack *stack)
static int filter_set_pred(struct event_filter *filter,
int idx,
struct pred_stack *stack,
- struct filter_pred *src,
- filter_pred_fn_t fn)
+ struct filter_pred *src)
{
struct filter_pred *dest = &filter->preds[idx];
struct filter_pred *left;
struct filter_pred *right;
*dest = *src;
- if (src->field_name) {
- dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
- if (!dest->field_name)
- return -ENOMEM;
- }
- dest->fn = fn;
dest->index = idx;
if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -743,11 +769,7 @@ static int filter_set_pred(struct event_filter *filter,
static void __free_preds(struct event_filter *filter)
{
- int i;
-
if (filter->preds) {
- for (i = 0; i < filter->a_preds; i++)
- kfree(filter->preds[i].field_name);
kfree(filter->preds);
filter->preds = NULL;
}
@@ -840,23 +862,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
}
}
-static int filter_add_pred_fn(struct filter_parse_state *ps,
- struct ftrace_event_call *call,
- struct event_filter *filter,
- struct filter_pred *pred,
- struct pred_stack *stack,
- filter_pred_fn_t fn)
+static int filter_add_pred(struct filter_parse_state *ps,
+ struct event_filter *filter,
+ struct filter_pred *pred,
+ struct pred_stack *stack)
{
- int idx, err;
+ int err;
if (WARN_ON(filter->n_preds == filter->a_preds)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
return -ENOSPC;
}
- idx = filter->n_preds;
- filter_clear_pred(&filter->preds[idx]);
- err = filter_set_pred(filter, idx, stack, pred, fn);
+ err = filter_set_pred(filter, filter->n_preds, stack, pred);
if (err)
return err;
@@ -937,31 +955,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
return fn;
}
-static int filter_add_pred(struct filter_parse_state *ps,
- struct ftrace_event_call *call,
- struct event_filter *filter,
- struct filter_pred *pred,
- struct pred_stack *stack,
- bool dry_run)
+static int init_pred(struct filter_parse_state *ps,
+ struct ftrace_event_field *field,
+ struct filter_pred *pred)
+
{
- struct ftrace_event_field *field;
- filter_pred_fn_t fn;
+ filter_pred_fn_t fn = filter_pred_none;
unsigned long long val;
int ret;
- fn = pred->fn = filter_pred_none;
-
- if (pred->op == OP_AND)
- goto add_pred_fn;
- else if (pred->op == OP_OR)
- goto add_pred_fn;
-
- field = find_event_field(call, pred->field_name);
- if (!field) {
- parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
- return -EINVAL;
- }
-
pred->offset = field->offset;
if (!is_legal_op(field, pred->op)) {
@@ -1001,9 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
if (pred->op == OP_NE)
pred->not = 1;
-add_pred_fn:
- if (!dry_run)
- return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
+ pred->fn = fn;
return 0;
}
@@ -1302,39 +1302,37 @@ parse_operand:
return 0;
}
-static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+static struct filter_pred *create_pred(struct filter_parse_state *ps,
+ struct ftrace_event_call *call,
+ int op, char *operand1, char *operand2)
{
- struct filter_pred *pred;
+ struct ftrace_event_field *field;
+ static struct filter_pred pred;
- pred = kzalloc(sizeof(*pred), GFP_KERNEL);
- if (!pred)
- return NULL;
+ memset(&pred, 0, sizeof(pred));
+ pred.op = op;
- pred->field_name = kstrdup(operand1, GFP_KERNEL);
- if (!pred->field_name) {
- kfree(pred);
+ if (op == OP_AND || op == OP_OR)
+ return &pred;
+
+ if (!operand1 || !operand2) {
+ parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
return NULL;
}
- strcpy(pred->regex.pattern, operand2);
- pred->regex.len = strlen(pred->regex.pattern);
-
- pred->op = op;
-
- return pred;
-}
-
-static struct filter_pred *create_logical_pred(int op)
-{
- struct filter_pred *pred;
-
- pred = kzalloc(sizeof(*pred), GFP_KERNEL);
- if (!pred)
+ field = find_event_field(call, operand1);
+ if (!field) {
+ parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
return NULL;
+ }
- pred->op = op;
+ strcpy(pred.regex.pattern, operand2);
+ pred.regex.len = strlen(pred.regex.pattern);
- return pred;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+ pred.field = field;
+#endif
+ return init_pred(ps, field, &pred) ? NULL : &pred;
}
static int check_preds(struct filter_parse_state *ps)
@@ -1375,6 +1373,23 @@ static int count_preds(struct filter_parse_state *ps)
return n_preds;
}
+struct check_pred_data {
+ int count;
+ int max;
+};
+
+static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct check_pred_data *d = data;
+
+ if (WARN_ON(d->count++ > d->max)) {
+ *err = -EINVAL;
+ return WALK_PRED_ABORT;
+ }
+ return WALK_PRED_DEFAULT;
+}
+
/*
* The tree is walked at filtering of an event. If the tree is not correctly
* built, it may cause an infinite loop. Check here that the tree does
@@ -1383,107 +1398,76 @@ static int count_preds(struct filter_parse_state *ps)
static int check_pred_tree(struct event_filter *filter,
struct filter_pred *root)
{
- struct filter_pred *preds;
- struct filter_pred *pred;
- enum move_type move = MOVE_DOWN;
- int count = 0;
- int done = 0;
- int max;
-
- /*
- * The max that we can hit a node is three times.
- * Once going down, once coming up from left, and
- * once coming up from right. This is more than enough
- * since leafs are only hit a single time.
- */
- max = 3 * filter->n_preds;
+ struct check_pred_data data = {
+ /*
+ * The max that we can hit a node is three times.
+ * Once going down, once coming up from left, and
+ * once coming up from right. This is more than enough
+ * since leafs are only hit a single time.
+ */
+ .max = 3 * filter->n_preds,
+ .count = 0,
+ };
- preds = filter->preds;
- if (!preds)
- return -EINVAL;
- pred = root;
+ return walk_pred_tree(filter->preds, root,
+ check_pred_tree_cb, &data);
+}
- do {
- if (WARN_ON(count++ > max))
- return -EINVAL;
+static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ int *count = data;
- switch (move) {
- case MOVE_DOWN:
- if (pred->left != FILTER_PRED_INVALID) {
- pred = &preds[pred->left];
- continue;
- }
- /* A leaf at the root is just a leaf in the tree */
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- case MOVE_UP_FROM_LEFT:
- pred = &preds[pred->right];
- move = MOVE_DOWN;
- continue;
- case MOVE_UP_FROM_RIGHT:
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- }
- done = 1;
- } while (!done);
+ if ((move == MOVE_DOWN) &&
+ (pred->left == FILTER_PRED_INVALID))
+ (*count)++;
- /* We are fine. */
- return 0;
+ return WALK_PRED_DEFAULT;
}
static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
{
- struct filter_pred *pred;
- enum move_type move = MOVE_DOWN;
- int count = 0;
- int done = 0;
+ int count = 0, ret;
- pred = root;
+ ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
+ WARN_ON(ret);
+ return count;
+}
- do {
- switch (move) {
- case MOVE_DOWN:
- if (pred->left != FILTER_PRED_INVALID) {
- pred = &preds[pred->left];
- continue;
- }
- /* A leaf at the root is just a leaf in the tree */
- if (pred == root)
- return 1;
- count++;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- case MOVE_UP_FROM_LEFT:
- pred = &preds[pred->right];
- move = MOVE_DOWN;
- continue;
- case MOVE_UP_FROM_RIGHT:
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- }
- done = 1;
- } while (!done);
+struct fold_pred_data {
+ struct filter_pred *root;
+ int count;
+ int children;
+};
- return count;
+static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct fold_pred_data *d = data;
+ struct filter_pred *root = d->root;
+
+ if (move != MOVE_DOWN)
+ return WALK_PRED_DEFAULT;
+ if (pred->left != FILTER_PRED_INVALID)
+ return WALK_PRED_DEFAULT;
+
+ if (WARN_ON(d->count == d->children)) {
+ *err = -EINVAL;
+ return WALK_PRED_ABORT;
+ }
+
+ pred->index &= ~FILTER_PRED_FOLD;
+ root->ops[d->count++] = pred->index;
+ return WALK_PRED_DEFAULT;
}
static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
{
- struct filter_pred *pred;
- enum move_type move = MOVE_DOWN;
- int count = 0;
+ struct fold_pred_data data = {
+ .root = root,
+ .count = 0,
+ };
int children;
- int done = 0;
/* No need to keep the fold flag */
root->index &= ~FILTER_PRED_FOLD;
@@ -1501,37 +1485,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
return -ENOMEM;
root->val = children;
+ data.children = children;
+ return walk_pred_tree(preds, root, fold_pred_cb, &data);
+}
- pred = root;
- do {
- switch (move) {
- case MOVE_DOWN:
- if (pred->left != FILTER_PRED_INVALID) {
- pred = &preds[pred->left];
- continue;
- }
- if (WARN_ON(count == children))
- return -EINVAL;
- pred->index &= ~FILTER_PRED_FOLD;
- root->ops[count++] = pred->index;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- case MOVE_UP_FROM_LEFT:
- pred = &preds[pred->right];
- move = MOVE_DOWN;
- continue;
- case MOVE_UP_FROM_RIGHT:
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- }
- done = 1;
- } while (!done);
+static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ struct filter_pred *preds = data;
- return 0;
+ if (move != MOVE_DOWN)
+ return WALK_PRED_DEFAULT;
+ if (!(pred->index & FILTER_PRED_FOLD))
+ return WALK_PRED_DEFAULT;
+
+ *err = fold_pred(preds, pred);
+ if (*err)
+ return WALK_PRED_ABORT;
+
+ /* eveyrhing below is folded, continue with parent */
+ return WALK_PRED_PARENT;
}
/*
@@ -1542,51 +1515,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
static int fold_pred_tree(struct event_filter *filter,
struct filter_pred *root)
{
- struct filter_pred *preds;
- struct filter_pred *pred;
- enum move_type move = MOVE_DOWN;
- int done = 0;
- int err;
-
- preds = filter->preds;
- if (!preds)
- return -EINVAL;
- pred = root;
-
- do {
- switch (move) {
- case MOVE_DOWN:
- if (pred->index & FILTER_PRED_FOLD) {
- err = fold_pred(preds, pred);
- if (err)
- return err;
- /* Folded nodes are like leafs */
- } else if (pred->left != FILTER_PRED_INVALID) {
- pred = &preds[pred->left];
- continue;
- }
-
- /* A leaf at the root is just a leaf in the tree */
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- case MOVE_UP_FROM_LEFT:
- pred = &preds[pred->right];
- move = MOVE_DOWN;
- continue;
- case MOVE_UP_FROM_RIGHT:
- if (pred == root)
- break;
- pred = get_pred_parent(pred, preds,
- pred->parent, &move);
- continue;
- }
- done = 1;
- } while (!done);
-
- return 0;
+ return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
+ filter->preds);
}
static int replace_preds(struct ftrace_event_call *call,
@@ -1643,27 +1573,17 @@ static int replace_preds(struct ftrace_event_call *call,
goto fail;
}
- if (elt->op == OP_AND || elt->op == OP_OR) {
- pred = create_logical_pred(elt->op);
- goto add_pred;
- }
-
- if (!operand1 || !operand2) {
- parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+ pred = create_pred(ps, call, elt->op, operand1, operand2);
+ if (!pred) {
err = -EINVAL;
goto fail;
}
- pred = create_pred(elt->op, operand1, operand2);
-add_pred:
- if (!pred) {
- err = -ENOMEM;
- goto fail;
+ if (!dry_run) {
+ err = filter_add_pred(ps, filter, pred, &stack);
+ if (err)
+ goto fail;
}
- err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
- filter_free_pred(pred);
- if (err)
- goto fail;
operand1 = operand2 = NULL;
}
@@ -1958,17 +1878,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
int err;
struct event_filter *filter;
struct filter_parse_state *ps;
- struct ftrace_event_call *call = NULL;
+ struct ftrace_event_call *call;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
- if (call->event.type == event_id)
- break;
- }
+ call = event->tp_event;
err = -EINVAL;
- if (&call->list == &ftrace_events)
+ if (!call)
goto out_unlock;
err = -EEXIST;
@@ -2012,3 +1929,215 @@ out_unlock:
#endif /* CONFIG_PERF_EVENTS */
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_events_filter_test.h"
+
+static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
+ struct event_filter **pfilter)
+{
+ struct event_filter *filter;
+ struct filter_parse_state *ps;
+ int err = -ENOMEM;
+
+ filter = __alloc_filter();
+ if (!filter)
+ goto out;
+
+ ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+ if (!ps)
+ goto free_filter;
+
+ parse_init(ps, filter_ops, filter_str);
+ err = filter_parse(ps);
+ if (err)
+ goto free_ps;
+
+ err = replace_preds(call, filter, ps, filter_str, false);
+ if (!err)
+ *pfilter = filter;
+
+ free_ps:
+ filter_opstack_clear(ps);
+ postfix_clear(ps);
+ kfree(ps);
+
+ free_filter:
+ if (err)
+ __free_filter(filter);
+
+ out:
+ return err;
+}
+
+#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
+{ \
+ .filter = FILTER, \
+ .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
+ .e = ve, .f = vf, .g = vg, .h = vh }, \
+ .match = m, \
+ .not_visited = nvisit, \
+}
+#define YES 1
+#define NO 0
+
+static struct test_filter_data_t {
+ char *filter;
+ struct ftrace_raw_ftrace_test_filter rec;
+ int match;
+ char *not_visited;
+} test_filter_data[] = {
+#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
+ "e == 1 && f == 1 && g == 1 && h == 1"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
+ DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
+ DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
+#undef FILTER
+#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
+ "e == 1 || f == 1 || g == 1 || h == 1"
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
+ DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+ DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
+#undef FILTER
+#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
+ "(e == 1 || f == 1) && (g == 1 || h == 1)"
+ DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
+ DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+ DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
+ DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
+ "(e == 1 && f == 1) || (g == 1 && h == 1)"
+ DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
+ DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+#undef FILTER
+#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
+ "(e == 1 && f == 1) || (g == 1 && h == 1)"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
+ DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
+#undef FILTER
+#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
+ "(e == 1 || f == 1)) && (g == 1 || h == 1)"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
+ DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
+ DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
+#undef FILTER
+#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
+ "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
+ DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+ DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
+#undef FILTER
+#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
+ "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
+ DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
+ DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
+ DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
+};
+
+#undef DATA_REC
+#undef FILTER
+#undef YES
+#undef NO
+
+#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
+
+static int test_pred_visited;
+
+static int test_pred_visited_fn(struct filter_pred *pred, void *event)
+{
+ struct ftrace_event_field *field = pred->field;
+
+ test_pred_visited = 1;
+ printk(KERN_INFO "\npred visited %s\n", field->name);
+ return 1;
+}
+
+static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
+ int *err, void *data)
+{
+ char *fields = data;
+
+ if ((move == MOVE_DOWN) &&
+ (pred->left == FILTER_PRED_INVALID)) {
+ struct ftrace_event_field *field = pred->field;
+
+ if (!field) {
+ WARN(1, "all leafs should have field defined");
+ return WALK_PRED_DEFAULT;
+ }
+ if (!strchr(fields, *field->name))
+ return WALK_PRED_DEFAULT;
+
+ WARN_ON(!pred->fn);
+ pred->fn = test_pred_visited_fn;
+ }
+ return WALK_PRED_DEFAULT;
+}
+
+static __init int ftrace_test_event_filter(void)
+{
+ int i;
+
+ printk(KERN_INFO "Testing ftrace filter: ");
+
+ for (i = 0; i < DATA_CNT; i++) {
+ struct event_filter *filter = NULL;
+ struct test_filter_data_t *d = &test_filter_data[i];
+ int err;
+
+ err = test_get_filter(d->filter, &event_ftrace_test_filter,
+ &filter);
+ if (err) {
+ printk(KERN_INFO
+ "Failed to get filter for '%s', err %d\n",
+ d->filter, err);
+ break;
+ }
+
+ /*
+ * The preemption disabling is not really needed for self
+ * tests, but the rcu dereference will complain without it.
+ */
+ preempt_disable();
+ if (*d->not_visited)
+ walk_pred_tree(filter->preds, filter->root,
+ test_walk_pred_cb,
+ d->not_visited);
+
+ test_pred_visited = 0;
+ err = filter_match_preds(filter, &d->rec);
+ preempt_enable();
+
+ __free_filter(filter);
+
+ if (test_pred_visited) {
+ printk(KERN_INFO
+ "Failed, unwanted pred visited for filter %s\n",
+ d->filter);
+ break;
+ }
+
+ if (err != d->match) {
+ printk(KERN_INFO
+ "Failed to match filter '%s', expected %d\n",
+ d->filter, d->match);
+ break;
+ }
+ }
+
+ if (i == DATA_CNT)
+ printk(KERN_CONT "OK\n");
+
+ return 0;
+}
+
+late_initcall(ftrace_test_event_filter);
+
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 000000000000..bfd4dba0d603
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM test
+
+#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_TEST_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(ftrace_test_filter,
+
+ TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
+
+ TP_ARGS(a, b, c, d, e, f, g, h),
+
+ TP_STRUCT__entry(
+ __field(int, a)
+ __field(int, b)
+ __field(int, c)
+ __field(int, d)
+ __field(int, e)
+ __field(int, f)
+ __field(int, g)
+ __field(int, h)
+ ),
+
+ TP_fast_assign(
+ __entry->a = a;
+ __entry->b = b;
+ __entry->c = c;
+ __entry->d = d;
+ __entry->e = e;
+ __entry->f = f;
+ __entry->g = g;
+ __entry->h = h;
+ ),
+
+ TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
+ __entry->a, __entry->b, __entry->c, __entry->d,
+ __entry->e, __entry->f, __entry->g, __entry->h)
+);
+
+#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_events_filter_test
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cfc..20dad0d7a163 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
static DEFINE_PER_CPU(int, tracing_cpu);
-static DEFINE_SPINLOCK(max_trace_lock);
+static DEFINE_RAW_SPINLOCK(max_trace_lock);
enum {
TRACER_IRQS_OFF = (1 << 1),
@@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr,
if (!report_latency(delta))
goto out;
- spin_lock_irqsave(&max_trace_lock, flags);
+ raw_spin_lock_irqsave(&max_trace_lock, flags);
/* check if we are still the max latency */
if (!report_latency(delta))
@@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr,
max_sequence++;
out_unlock:
- spin_unlock_irqrestore(&max_trace_lock, flags);
+ raw_spin_unlock_irqrestore(&max_trace_lock, flags);
out:
data->critical_sequence = max_sequence;
@@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
#ifdef CONFIG_PREEMPT_TRACER
void trace_preempt_on(unsigned long a0, unsigned long a1)
{
- if (preempt_trace())
+ if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
void trace_preempt_off(unsigned long a0, unsigned long a1)
{
- if (preempt_trace())
+ if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
}
#endif /* CONFIG_PREEMPT_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697bf0e5..00d527c945a4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp)
}
/* Unregister a trace_probe and probe_event: call with locking probe_lock */
-static void unregister_trace_probe(struct trace_probe *tp)
+static int unregister_trace_probe(struct trace_probe *tp)
{
+ /* Enabled event can not be unregistered */
+ if (trace_probe_is_enabled(tp))
+ return -EBUSY;
+
__unregister_trace_probe(tp);
list_del(&tp->list);
unregister_probe_event(tp);
+
+ return 0;
}
/* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp)
/* Delete old (same name) event if exist */
old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
if (old_tp) {
- unregister_trace_probe(old_tp);
+ ret = unregister_trace_probe(old_tp);
+ if (ret < 0)
+ goto end;
free_trace_probe(old_tp);
}
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb,
mutex_lock(&probe_lock);
list_for_each_entry(tp, &probe_list, list) {
if (trace_probe_within_module(tp, mod)) {
+ /* Don't need to check busy - this should have gone. */
__unregister_trace_probe(tp);
ret = __register_trace_probe(tp);
if (ret)
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv)
return -ENOENT;
}
/* delete an event */
- unregister_trace_probe(tp);
- free_trace_probe(tp);
+ ret = unregister_trace_probe(tp);
+ if (ret == 0)
+ free_trace_probe(tp);
mutex_unlock(&probe_lock);
- return 0;
+ return ret;
}
if (argc < 2) {
@@ -1317,18 +1327,29 @@ error:
return ret;
}
-static void release_all_trace_probes(void)
+static int release_all_trace_probes(void)
{
struct trace_probe *tp;
+ int ret = 0;
mutex_lock(&probe_lock);
+ /* Ensure no probe is in use. */
+ list_for_each_entry(tp, &probe_list, list)
+ if (trace_probe_is_enabled(tp)) {
+ ret = -EBUSY;
+ goto end;
+ }
/* TODO: Use batch unregistration */
while (!list_empty(&probe_list)) {
tp = list_entry(probe_list.next, struct trace_probe, list);
unregister_trace_probe(tp);
free_trace_probe(tp);
}
+
+end:
mutex_unlock(&probe_lock);
+
+ return ret;
}
/* Probes listing interfaces */
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
static int probes_open(struct inode *inode, struct file *file)
{
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC))
- release_all_trace_probes();
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = release_all_trace_probes();
+ if (ret < 0)
+ return ret;
+ }
return seq_open(file, &probes_seq_op);
}
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
ret = target(1, 2, 3, 4, 5, 6);
+ /* Disable trace points before removing it */
+ tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tp == NULL)) {
+ pr_warning("error on getting test probe.\n");
+ warn++;
+ } else
+ disable_trace_probe(tp, TP_FLAG_TRACE);
+
+ tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
+ if (WARN_ON_ONCE(tp == NULL)) {
+ pr_warning("error on getting 2nd test probe.\n");
+ warn++;
+ } else
+ disable_trace_probe(tp, TP_FLAG_TRACE);
+
ret = command_trace_probe("-:testprobe");
if (WARN_ON_ONCE(ret)) {
pr_warning("error on deleting a probe.\n");
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468a10d7..6fd4ffd042f9 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
continue;
}
+ fmt = NULL;
tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
- if (tb_fmt)
+ if (tb_fmt) {
fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
- if (tb_fmt && fmt) {
- list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
- strcpy(fmt, *iter);
- tb_fmt->fmt = fmt;
- *iter = tb_fmt->fmt;
- } else {
- kfree(tb_fmt);
- *iter = NULL;
+ if (fmt) {
+ list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+ strcpy(fmt, *iter);
+ tb_fmt->fmt = fmt;
+ } else
+ kfree(tb_fmt);
}
+ *iter = fmt;
+
}
mutex_unlock(&btrace_mutex);
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f1449c54..db110b8ae030 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
static const int tracepoint_debug;
/*
- * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
- * builtin and module tracepoints and the hash table.
+ * Tracepoints mutex protects the builtin and module tracepoints and the hash
+ * table, as well as the local module list.
*/
static DEFINE_MUTEX(tracepoints_mutex);
+#ifdef CONFIG_MODULES
+/* Local list of struct module */
+static LIST_HEAD(tracepoint_module_list);
+#endif /* CONFIG_MODULES */
+
/*
* Tracepoint hash table, containing the active tracepoints.
* Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem)
* @end: end of the range
*
* Updates the probe callback corresponding to a range of tracepoints.
+ * Called with tracepoints_mutex held.
*/
-void tracepoint_update_probe_range(struct tracepoint * const *begin,
- struct tracepoint * const *end)
+static void tracepoint_update_probe_range(struct tracepoint * const *begin,
+ struct tracepoint * const *end)
{
struct tracepoint * const *iter;
struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
if (!begin)
return;
- mutex_lock(&tracepoints_mutex);
for (iter = begin; iter < end; iter++) {
mark_entry = get_tracepoint((*iter)->name);
if (mark_entry) {
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
disable_tracepoint(*iter);
}
}
- mutex_unlock(&tracepoints_mutex);
}
+#ifdef CONFIG_MODULES
+void module_update_tracepoints(void)
+{
+ struct tp_module *tp_mod;
+
+ list_for_each_entry(tp_mod, &tracepoint_module_list, list)
+ tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
+ tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
+}
+#else /* CONFIG_MODULES */
+void module_update_tracepoints(void)
+{
+}
+#endif /* CONFIG_MODULES */
+
+
/*
* Update probes, removing the faulty probes.
+ * Called with tracepoints_mutex held.
*/
static void tracepoint_update_probes(void)
{
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
mutex_lock(&tracepoints_mutex);
old = tracepoint_add_probe(name, probe, data);
- mutex_unlock(&tracepoints_mutex);
- if (IS_ERR(old))
+ if (IS_ERR(old)) {
+ mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
-
+ }
tracepoint_update_probes(); /* may update entry */
+ mutex_unlock(&tracepoints_mutex);
release_probes(old);
return 0;
}
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
mutex_lock(&tracepoints_mutex);
old = tracepoint_remove_probe(name, probe, data);
- mutex_unlock(&tracepoints_mutex);
- if (IS_ERR(old))
+ if (IS_ERR(old)) {
+ mutex_unlock(&tracepoints_mutex);
return PTR_ERR(old);
-
+ }
tracepoint_update_probes(); /* may update entry */
+ mutex_unlock(&tracepoints_mutex);
release_probes(old);
return 0;
}
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void)
if (!list_empty(&old_probes))
list_replace_init(&old_probes, &release_probes);
need_update = 0;
- mutex_unlock(&tracepoints_mutex);
-
tracepoint_update_probes();
+ mutex_unlock(&tracepoints_mutex);
list_for_each_entry_safe(pos, next, &release_probes, u.list) {
list_del(&pos->u.list);
call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
* Will return the first tracepoint in the range if the input tracepoint is
* NULL.
*/
-int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
struct tracepoint * const *begin, struct tracepoint * const *end)
{
if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
return 1;
return 0;
}
-EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
+#ifdef CONFIG_MODULES
static void tracepoint_get_iter(struct tracepoint_iter *iter)
{
int found = 0;
+ struct tp_module *iter_mod;
/* Core kernel tracepoints */
if (!iter->module) {
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
if (found)
goto end;
}
- /* tracepoints in modules. */
- found = module_get_iter_tracepoints(iter);
+ /* Tracepoints in modules */
+ mutex_lock(&tracepoints_mutex);
+ list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
+ /*
+ * Sorted module list
+ */
+ if (iter_mod < iter->module)
+ continue;
+ else if (iter_mod > iter->module)
+ iter->tracepoint = NULL;
+ found = tracepoint_get_iter_range(&iter->tracepoint,
+ iter_mod->tracepoints_ptrs,
+ iter_mod->tracepoints_ptrs
+ + iter_mod->num_tracepoints);
+ if (found) {
+ iter->module = iter_mod;
+ break;
+ }
+ }
+ mutex_unlock(&tracepoints_mutex);
end:
if (!found)
tracepoint_iter_reset(iter);
}
+#else /* CONFIG_MODULES */
+static void tracepoint_get_iter(struct tracepoint_iter *iter)
+{
+ int found = 0;
+
+ /* Core kernel tracepoints */
+ found = tracepoint_get_iter_range(&iter->tracepoint,
+ __start___tracepoints_ptrs,
+ __stop___tracepoints_ptrs);
+ if (!found)
+ tracepoint_iter_reset(iter);
+}
+#endif /* CONFIG_MODULES */
void tracepoint_iter_start(struct tracepoint_iter *iter)
{
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
void tracepoint_iter_reset(struct tracepoint_iter *iter)
{
+#ifdef CONFIG_MODULES
iter->module = NULL;
+#endif /* CONFIG_MODULES */
iter->tracepoint = NULL;
}
EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
#ifdef CONFIG_MODULES
+static int tracepoint_module_coming(struct module *mod)
+{
+ struct tp_module *tp_mod, *iter;
+ int ret = 0;
+
+ /*
+ * We skip modules that tain the kernel, especially those with different
+ * module header (for forced load), to make sure we don't cause a crash.
+ */
+ if (mod->taints)
+ return 0;
+ mutex_lock(&tracepoints_mutex);
+ tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
+ if (!tp_mod) {
+ ret = -ENOMEM;
+ goto end;
+ }
+ tp_mod->num_tracepoints = mod->num_tracepoints;
+ tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
+
+ /*
+ * tracepoint_module_list is kept sorted by struct module pointer
+ * address for iteration on tracepoints from a seq_file that can release
+ * the mutex between calls.
+ */
+ list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
+ BUG_ON(iter == tp_mod); /* Should never be in the list twice */
+ if (iter < tp_mod) {
+ /* We belong to the location right after iter. */
+ list_add(&tp_mod->list, &iter->list);
+ goto module_added;
+ }
+ }
+ /* We belong to the beginning of the list */
+ list_add(&tp_mod->list, &tracepoint_module_list);
+module_added:
+ tracepoint_update_probe_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
+end:
+ mutex_unlock(&tracepoints_mutex);
+ return ret;
+}
+
+static int tracepoint_module_going(struct module *mod)
+{
+ struct tp_module *pos;
+
+ mutex_lock(&tracepoints_mutex);
+ tracepoint_update_probe_range(mod->tracepoints_ptrs,
+ mod->tracepoints_ptrs + mod->num_tracepoints);
+ list_for_each_entry(pos, &tracepoint_module_list, list) {
+ if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
+ list_del(&pos->list);
+ kfree(pos);
+ break;
+ }
+ }
+ /*
+ * In the case of modules that were tainted at "coming", we'll simply
+ * walk through the list without finding it. We cannot use the "tainted"
+ * flag on "going", in case a module taints the kernel only after being
+ * loaded.
+ */
+ mutex_unlock(&tracepoints_mutex);
+ return 0;
+}
int tracepoint_module_notify(struct notifier_block *self,
unsigned long val, void *data)
{
struct module *mod = data;
+ int ret = 0;
switch (val) {
case MODULE_STATE_COMING:
+ ret = tracepoint_module_coming(mod);
+ break;
+ case MODULE_STATE_LIVE:
+ break;
case MODULE_STATE_GOING:
- tracepoint_update_probe_range(mod->tracepoints_ptrs,
- mod->tracepoints_ptrs + mod->num_tracepoints);
+ ret = tracepoint_module_going(mod);
break;
}
- return 0;
+ return ret;
}
struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@ static int init_tracepoints(void)
return register_module_notifier(&tracepoint_module_nb);
}
__initcall(init_tracepoints);
-
#endif /* CONFIG_MODULES */
#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd5b7d4..d680381b0e9c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
static int watchdog(void *unused)
{
- static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@ static int watchdog(void *unused)
set_current_state(TASK_INTERRUPTIBLE);
}
__set_current_state(TASK_RUNNING);
-
+ param.sched_priority = 0;
+ sched_setscheduler(current, SCHED_NORMAL, &param);
return 0;
}
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu)
/* create the watchdog thread */
if (!p) {
- p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
+ p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
if (IS_ERR(p)) {
printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
if (!err) {