summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2014-06-06 07:55:06 +0200
committerIngo Molnar <mingo@kernel.org>2014-06-06 07:55:06 +0200
commitec00010972a0971b2c1da4fbe4e5c7d8ed1ecb05 (patch)
treec28975d7daf6d8a3aa23afe8f42837b71105b269 /kernel
parentMerge branch 'uprobes/core' of git://git.kernel.org/pub/scm/linux/kernel/git/... (diff)
parentperf: Fix perf_event_comm() vs. exec() assumption (diff)
downloadlinux-ec00010972a0971b2c1da4fbe4e5c7d8ed1ecb05.tar.xz
linux-ec00010972a0971b2c1da4fbe4e5c7d8ed1ecb05.zip
Merge branch 'perf/urgent' into perf/core, to resolve conflict and to prepare for new patches
Conflicts: arch/x86/kernel/traps.c Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup.c10
-rw-r--r--kernel/cgroup_freezer.c116
-rw-r--r--kernel/context_tracking.c2
-rw-r--r--kernel/cpu.c6
-rw-r--r--kernel/events/core.c153
-rw-r--r--kernel/futex.c52
-rw-r--r--kernel/hrtimer.c8
-rw-r--r--kernel/kexec.c8
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/rtmutex.c32
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/sched/core.c80
-rw-r--r--kernel/sched/cpudeadline.c37
-rw-r--r--kernel/sched/cpudeadline.h6
-rw-r--r--kernel/sched/cpupri.c10
-rw-r--r--kernel/sched/cpupri.h2
-rw-r--r--kernel/sched/cputime.c32
-rw-r--r--kernel/sched/deadline.c5
-rw-r--r--kernel/sched/fair.c16
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/tracepoint.c4
-rw-r--r--kernel/workqueue.c36
23 files changed, 377 insertions, 250 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9fcdaa705b6c..3f1ca934a237 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -348,7 +348,7 @@ struct cgrp_cset_link {
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
-static struct css_set init_css_set = {
+struct css_set init_css_set = {
.refcount = ATOMIC_INIT(1),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
@@ -1495,7 +1495,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
*/
if (!use_task_css_set_links)
cgroup_enable_task_cg_lists();
-retry:
+
mutex_lock(&cgroup_tree_mutex);
mutex_lock(&cgroup_mutex);
@@ -1503,7 +1503,7 @@ retry:
ret = parse_cgroupfs_options(data, &opts);
if (ret)
goto out_unlock;
-
+retry:
/* look for a matching existing root */
if (!opts.subsys_mask && !opts.none && !opts.name) {
cgrp_dfl_root_visible = true;
@@ -1562,9 +1562,9 @@ retry:
if (!atomic_inc_not_zero(&root->cgrp.refcnt)) {
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgroup_tree_mutex);
- kfree(opts.release_agent);
- kfree(opts.name);
msleep(10);
+ mutex_lock(&cgroup_tree_mutex);
+ mutex_lock(&cgroup_mutex);
goto retry;
}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 2bc4a2256444..345628c78b5b 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
+#include <linux/mutex.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
@@ -42,9 +43,10 @@ enum freezer_state_flags {
struct freezer {
struct cgroup_subsys_state css;
unsigned int state;
- spinlock_t lock;
};
+static DEFINE_MUTEX(freezer_mutex);
+
static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct freezer, css) : NULL;
@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
if (!freezer)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&freezer->lock);
return &freezer->css;
}
@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
- /*
- * The following double locking and freezing state inheritance
- * guarantee that @cgroup can never escape ancestors' freezing
- * states. See css_for_each_descendant_pre() for details.
- */
- if (parent)
- spin_lock_irq(&parent->lock);
- spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
+ mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
atomic_inc(&system_freezing_cnt);
}
- spin_unlock(&freezer->lock);
- if (parent)
- spin_unlock_irq(&parent->lock);
-
+ mutex_unlock(&freezer_mutex);
return 0;
}
@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
freezer->state = 0;
- spin_unlock_irq(&freezer->lock);
+ mutex_unlock(&freezer_mutex);
}
static void freezer_css_free(struct cgroup_subsys_state *css)
@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
struct task_struct *task;
bool clear_frozen = false;
- spin_lock_irq(&freezer->lock);
+ mutex_lock(&freezer_mutex);
/*
* Make the new tasks conform to the current state of @new_css.
@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
}
}
- spin_unlock_irq(&freezer->lock);
-
- /*
- * Propagate FROZEN clearing upwards. We may race with
- * update_if_frozen(), but as long as both work bottom-up, either
- * update_if_frozen() sees child's FROZEN cleared or we clear the
- * parent's FROZEN later. No parent w/ !FROZEN children can be
- * left FROZEN.
- */
+ /* propagate FROZEN clearing upwards */
while (clear_frozen && (freezer = parent_freezer(freezer))) {
- spin_lock_irq(&freezer->lock);
freezer->state &= ~CGROUP_FROZEN;
clear_frozen = freezer->state & CGROUP_FREEZING;
- spin_unlock_irq(&freezer->lock);
}
+
+ mutex_unlock(&freezer_mutex);
}
/**
@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
- rcu_read_lock();
- freezer = task_freezer(task);
-
/*
* The root cgroup is non-freezable, so we can skip locking the
* freezer. This is safe regardless of race with task migration.
@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task)
* to do. If we lost and root is the new cgroup, noop is still the
* right thing to do.
*/
- if (!parent_freezer(freezer))
- goto out;
+ if (task_css_is_root(task, freezer_cgrp_id))
+ return;
- /*
- * Grab @freezer->lock and freeze @task after verifying @task still
- * belongs to @freezer and it's freezing. The former is for the
- * case where we have raced against task migration and lost and
- * @task is already in a different cgroup which may not be frozen.
- * This isn't strictly necessary as freeze_task() is allowed to be
- * called spuriously but let's do it anyway for, if nothing else,
- * documentation.
- */
- spin_lock_irq(&freezer->lock);
- if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
+ mutex_lock(&freezer_mutex);
+ rcu_read_lock();
+
+ freezer = task_freezer(task);
+ if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
- spin_unlock_irq(&freezer->lock);
-out:
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
/**
@@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
struct css_task_iter it;
struct task_struct *task;
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- spin_lock_irq(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZING) ||
(freezer->state & CGROUP_FROZEN))
- goto out_unlock;
+ return;
/* are all (live) children frozen? */
+ rcu_read_lock();
css_for_each_child(pos, css) {
struct freezer *child = css_freezer(pos);
if ((child->state & CGROUP_FREEZER_ONLINE) &&
- !(child->state & CGROUP_FROZEN))
- goto out_unlock;
+ !(child->state & CGROUP_FROZEN)) {
+ rcu_read_unlock();
+ return;
+ }
}
+ rcu_read_unlock();
/* are all tasks frozen? */
css_task_iter_start(css, &it);
@@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
freezer->state |= CGROUP_FROZEN;
out_iter_end:
css_task_iter_end(&it);
-out_unlock:
- spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *css = seq_css(m), *pos;
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
/* update states bottom-up */
- css_for_each_descendant_post(pos, css)
+ css_for_each_descendant_post(pos, css) {
+ if (!css_tryget(pos))
+ continue;
+ rcu_read_unlock();
+
update_if_frozen(pos);
+ rcu_read_lock();
+ css_put(pos);
+ }
+
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
@@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
unsigned int state)
{
/* also synchronizes against task migration, see freezer_attach() */
- lockdep_assert_held(&freezer->lock);
+ lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZER_ONLINE))
return;
@@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
* descendant will try to inherit its parent's FREEZING state as
* CGROUP_FREEZING_PARENT.
*/
+ mutex_lock(&freezer_mutex);
rcu_read_lock();
css_for_each_descendant_pre(pos, &freezer->css) {
struct freezer *pos_f = css_freezer(pos);
struct freezer *parent = parent_freezer(pos_f);
- spin_lock_irq(&pos_f->lock);
+ if (!css_tryget(pos))
+ continue;
+ rcu_read_unlock();
- if (pos_f == freezer) {
+ if (pos_f == freezer)
freezer_apply_state(pos_f, freeze,
CGROUP_FREEZING_SELF);
- } else {
- /*
- * Our update to @parent->state is already visible
- * which is all we need. No need to lock @parent.
- * For more info on synchronization, see
- * freezer_post_create().
- */
+ else
freezer_apply_state(pos_f,
parent->state & CGROUP_FREEZING,
CGROUP_FREEZING_PARENT);
- }
- spin_unlock_irq(&pos_f->lock);
+ rcu_read_lock();
+ css_put(pos);
}
rcu_read_unlock();
+ mutex_unlock(&freezer_mutex);
}
static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6cb20d2e7ee0..019d45008448 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
{
enum ctx_state prev_ctx;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a9e710eef0e2..247979a1b815 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present)
void set_cpu_online(unsigned int cpu, bool online)
{
- if (online)
+ if (online) {
cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
- else
+ cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+ } else {
cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+ }
}
void set_cpu_active(unsigned int cpu, bool active)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index e9ef0c6646af..8fac2056d51e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2973,6 +2973,22 @@ out:
local_irq_restore(flags);
}
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -3195,7 +3211,8 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
+static void ring_buffer_attach(struct perf_event *event,
+ struct ring_buffer *rb);
static void unaccount_event_cpu(struct perf_event *event, int cpu)
{
@@ -3259,8 +3276,6 @@ static void _free_event(struct perf_event *event)
unaccount_event(event);
if (event->rb) {
- struct ring_buffer *rb;
-
/*
* Can happen when we close an event with re-directed output.
*
@@ -3268,12 +3283,7 @@ static void _free_event(struct perf_event *event)
* over us; possibly making our ring_buffer_put() the last.
*/
mutex_lock(&event->mmap_mutex);
- rb = event->rb;
- if (rb) {
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
- ring_buffer_put(rb); /* could be last */
- }
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
}
@@ -3870,28 +3880,47 @@ unlock:
static void ring_buffer_attach(struct perf_event *event,
struct ring_buffer *rb)
{
+ struct ring_buffer *old_rb = NULL;
unsigned long flags;
- if (!list_empty(&event->rb_entry))
- return;
+ if (event->rb) {
+ /*
+ * Should be impossible, we set this when removing
+ * event->rb_entry and wait/clear when adding event->rb_entry.
+ */
+ WARN_ON_ONCE(event->rcu_pending);
- spin_lock_irqsave(&rb->event_lock, flags);
- if (list_empty(&event->rb_entry))
- list_add(&event->rb_entry, &rb->event_list);
- spin_unlock_irqrestore(&rb->event_lock, flags);
-}
+ old_rb = event->rb;
+ event->rcu_batches = get_state_synchronize_rcu();
+ event->rcu_pending = 1;
-static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
-{
- unsigned long flags;
+ spin_lock_irqsave(&old_rb->event_lock, flags);
+ list_del_rcu(&event->rb_entry);
+ spin_unlock_irqrestore(&old_rb->event_lock, flags);
+ }
- if (list_empty(&event->rb_entry))
- return;
+ if (event->rcu_pending && rb) {
+ cond_synchronize_rcu(event->rcu_batches);
+ event->rcu_pending = 0;
+ }
+
+ if (rb) {
+ spin_lock_irqsave(&rb->event_lock, flags);
+ list_add_rcu(&event->rb_entry, &rb->event_list);
+ spin_unlock_irqrestore(&rb->event_lock, flags);
+ }
+
+ rcu_assign_pointer(event->rb, rb);
- spin_lock_irqsave(&rb->event_lock, flags);
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
}
static void ring_buffer_wakeup(struct perf_event *event)
@@ -3960,7 +3989,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
@@ -3968,18 +3997,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
atomic_dec(&rb->mmap_count);
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
- return;
+ goto out_put;
- /* Detach current event from the buffer. */
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
+ ring_buffer_attach(event, NULL);
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count)) {
- ring_buffer_put(rb); /* can't be last */
- return;
- }
+ if (atomic_read(&rb->mmap_count))
+ goto out_put;
/*
* No other mmap()s, detach from all other events that might redirect
@@ -4009,11 +4034,9 @@ again:
* still restart the iteration to make sure we're not now
* iterating the wrong list.
*/
- if (event->rb == rb) {
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
- ring_buffer_put(rb); /* can't be last, we still have one */
- }
+ if (event->rb == rb)
+ ring_buffer_attach(event, NULL);
+
mutex_unlock(&event->mmap_mutex);
put_event(event);
@@ -4038,6 +4061,7 @@ again:
vma->vm_mm->pinned_vm -= mmap_locked;
free_uid(mmap_user);
+out_put:
ring_buffer_put(rb); /* could be last */
}
@@ -4155,7 +4179,6 @@ again:
vma->vm_mm->pinned_vm += extra;
ring_buffer_attach(event, rb);
- rcu_assign_pointer(event->rb, rb);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
@@ -5070,18 +5093,6 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
void perf_event_comm(struct task_struct *task)
{
struct perf_comm_event comm_event;
- struct perf_event_context *ctx;
- int ctxn;
-
- rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
- perf_event_enable_on_exec(ctx);
- }
- rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -5439,6 +5450,9 @@ struct swevent_htable {
/* Recursion avoidance in each contexts */
int recursion[PERF_NR_CONTEXTS];
+
+ /* Keeps track of cpu being initialized/exited */
+ bool online;
};
static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5685,8 +5699,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
hwc->state = !(flags & PERF_EF_START);
head = find_swevent_head(swhash, event);
- if (WARN_ON_ONCE(!head))
+ if (!head) {
+ /*
+ * We can race with cpu hotplug code. Do not
+ * WARN if the cpu just got unplugged.
+ */
+ WARN_ON_ONCE(swhash->online);
return -EINVAL;
+ }
hlist_add_head_rcu(&event->hlist_entry, head);
@@ -6956,7 +6976,7 @@ err_size:
static int
perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
{
- struct ring_buffer *rb = NULL, *old_rb = NULL;
+ struct ring_buffer *rb = NULL;
int ret = -EINVAL;
if (!output_event)
@@ -6984,8 +7004,6 @@ set:
if (atomic_read(&event->mmap_count))
goto unlock;
- old_rb = event->rb;
-
if (output_event) {
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
@@ -6993,23 +7011,7 @@ set:
goto unlock;
}
- if (old_rb)
- ring_buffer_detach(event, old_rb);
-
- if (rb)
- ring_buffer_attach(event, rb);
-
- rcu_assign_pointer(event->rb, rb);
-
- if (old_rb) {
- ring_buffer_put(old_rb);
- /*
- * Since we detached before setting the new rb, so that we
- * could attach the new rb, we could have missed a wakeup.
- * Provide it now.
- */
- wake_up_all(&event->waitq);
- }
+ ring_buffer_attach(event, rb);
ret = 0;
unlock:
@@ -7060,6 +7062,9 @@ SYSCALL_DEFINE5(perf_event_open,
if (attr.freq) {
if (attr.sample_freq > sysctl_perf_event_sample_rate)
return -EINVAL;
+ } else {
+ if (attr.sample_period & (1ULL << 63))
+ return -EINVAL;
}
/*
@@ -7872,6 +7877,7 @@ static void perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = true;
if (swhash->hlist_refcount > 0) {
struct swevent_hlist *hlist;
@@ -7929,6 +7935,7 @@ static void perf_event_exit_cpu(int cpu)
perf_event_exit_cpu_context(cpu);
mutex_lock(&swhash->hlist_mutex);
+ swhash->online = false;
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
}
diff --git a/kernel/futex.c b/kernel/futex.c
index 5f589279e462..81dbe773ce4c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -745,7 +745,8 @@ void exit_pi_state_list(struct task_struct *curr)
static int
lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+ union futex_key *key, struct futex_pi_state **ps,
+ struct task_struct *task)
{
struct futex_pi_state *pi_state = NULL;
struct futex_q *this, *next;
@@ -786,6 +787,16 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return -EINVAL;
}
+ /*
+ * Protect against a corrupted uval. If uval
+ * is 0x80000000 then pid is 0 and the waiter
+ * bit is set. So the deadlock check in the
+ * calling code has failed and we did not fall
+ * into the check above due to !pid.
+ */
+ if (task && pi_state->owner == task)
+ return -EDEADLK;
+
atomic_inc(&pi_state->refcount);
*ps = pi_state;
@@ -803,6 +814,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
if (!p)
return -ESRCH;
+ if (!p->mm) {
+ put_task_struct(p);
+ return -EPERM;
+ }
+
/*
* We need to look at the task state flags to figure out,
* whether the task is exiting. To protect against the do_exit
@@ -935,7 +951,7 @@ retry:
* We dont have the lock. Look up the PI state (or create it if
* we are the first waiter):
*/
- ret = lookup_pi_state(uval, hb, key, ps);
+ ret = lookup_pi_state(uval, hb, key, ps, task);
if (unlikely(ret)) {
switch (ret) {
@@ -1347,7 +1363,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
*
* Return:
* 0 - failed to acquire the lock atomically;
- * 1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1358,7 +1374,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret;
+ int ret, vpid;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -1386,11 +1402,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* the contended case or if set_waiters is 1. The pi_state is returned
* in ps in contended cases.
*/
+ vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
set_waiters);
- if (ret == 1)
+ if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+ return vpid;
+ }
return ret;
}
@@ -1421,7 +1439,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
struct futex_pi_state *pi_state = NULL;
struct futex_hash_bucket *hb1, *hb2;
struct futex_q *this, *next;
- u32 curval2;
if (requeue_pi) {
/*
@@ -1509,16 +1526,25 @@ retry_private:
* At this point the top_waiter has either taken uaddr2 or is
* waiting on it. If the former, then the pi_state will not
* exist yet, look it up one more time to ensure we have a
- * reference to it.
+ * reference to it. If the lock was taken, ret contains the
+ * vpid of the top waiter task.
*/
- if (ret == 1) {
+ if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
- ret = get_futex_value_locked(&curval2, uaddr2);
- if (!ret)
- ret = lookup_pi_state(curval2, hb2, &key2,
- &pi_state);
+ /*
+ * If we acquired the lock, then the user
+ * space value of uaddr2 should be vpid. It
+ * cannot be changed by the top waiter as it
+ * is blocked on hb2 lock if it tries to do
+ * so. If something fiddled with it behind our
+ * back the pi state lookup might unearth
+ * it. So we rather use the known value than
+ * rereading and handing potential crap to
+ * lookup_pi_state.
+ */
+ ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
}
switch (ret) {
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d10eba8089d1..3ab28993f6e0 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -990,11 +990,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
ret = remove_hrtimer(timer, base);
- /* Switch the timer base, if necessary: */
- new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
-
if (mode & HRTIMER_MODE_REL) {
- tim = ktime_add_safe(tim, new_base->get_time());
+ tim = ktime_add_safe(tim, base->get_time());
/*
* CONFIG_TIME_LOW_RES is a temporary way for architectures
* to signal that they simply return xtime in
@@ -1009,6 +1006,9 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
+ /* Switch the timer base, if necessary: */
+ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED);
+
timer_stats_hrtimer_set_start_info(timer);
leftmost = enqueue_hrtimer(timer, new_base);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c8380ad203bc..28c57069ef68 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1683,6 +1683,14 @@ int kernel_kexec(void)
kexec_in_progress = true;
kernel_restart_prepare(NULL);
migrate_to_reboot_cpu();
+
+ /*
+ * migrate_to_reboot_cpu() disables CPU hotplug assuming that
+ * no further code needs to use CPU hotplug (which is true in
+ * the reboot case). However, the kexec path depends on using
+ * CPU hotplug again; so re-enable it here.
+ */
+ cpu_hotplug_enable();
printk(KERN_EMERG "Starting new kernel\n");
machine_shutdown();
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index b0e9467922e1..d24e4339b46d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -4188,7 +4188,7 @@ void debug_show_held_locks(struct task_struct *task)
}
EXPORT_SYMBOL_GPL(debug_show_held_locks);
-asmlinkage void lockdep_sys_exit(void)
+asmlinkage __visible void lockdep_sys_exit(void)
{
struct task_struct *curr = current;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index aa4dff04b594..a620d4d08ca6 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* top_waiter can be NULL, when we are in the deboosting
* mode!
*/
- if (top_waiter && (!task_has_pi_waiters(task) ||
- top_waiter != task_top_pi_waiter(task)))
- goto out_unlock_pi;
+ if (top_waiter) {
+ if (!task_has_pi_waiters(task))
+ goto out_unlock_pi;
+ /*
+ * If deadlock detection is off, we stop here if we
+ * are not the top pi waiter of the task.
+ */
+ if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
+ goto out_unlock_pi;
+ }
/*
* When deadlock detection is off then we check, if further
@@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto retry;
}
- /* Deadlock detection */
+ /*
+ * Deadlock detection. If the lock is the same as the original
+ * lock which caused us to walk the lock chain or if the
+ * current lock is owned by the task which initiated the chain
+ * walk, we detected a deadlock.
+ */
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
@@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
unsigned long flags;
int chain_walk = 0, res;
+ /*
+ * Early deadlock detection. We really don't want the task to
+ * enqueue on itself just to untangle the mess later. It's not
+ * only an optimization. We drop the locks, so another waiter
+ * can come in before the chain walk detects the deadlock. So
+ * the other will detect the deadlock and return -EDEADLOCK,
+ * which is wrong, as the other waiter is not in a deadlock
+ * situation.
+ */
+ if (detect_deadlock && owner == task)
+ return -EDEADLK;
+
raw_spin_lock_irqsave(&task->pi_lock, flags);
__rt_mutex_adjust_prio(task);
waiter->task = task;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 18fb7a2fb14b..1ea328aafdc9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1586,7 +1586,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
return -ENOMEM;
}
-asmlinkage int swsusp_save(void)
+asmlinkage __visible int swsusp_save(void)
{
unsigned int nr_pages, nr_highmem;
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index a45b50962295..7228258b85ec 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1674,7 +1674,7 @@ EXPORT_SYMBOL(printk_emit);
*
* See the vsnprintf() documentation for format string extensions over C99.
*/
-asmlinkage int printk(const char *fmt, ...)
+asmlinkage __visible int printk(const char *fmt, ...)
{
va_list args;
int r;
@@ -1737,7 +1737,7 @@ void early_vprintk(const char *fmt, va_list ap)
}
}
-asmlinkage void early_printk(const char *fmt, ...)
+asmlinkage __visible void early_printk(const char *fmt, ...)
{
va_list ap;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 00781cc38047..a8c0fde25e4a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2192,7 +2192,7 @@ static inline void post_schedule(struct rq *rq)
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
*/
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
struct rq *rq = this_rq();
@@ -2594,8 +2594,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
- if (likely(p && p != RETRY_TASK))
- return p;
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+
+ /* assumes fair_sched_class->next == idle_sched_class */
+ if (unlikely(!p))
+ p = idle_sched_class.pick_next_task(rq, prev);
+
+ return p;
}
again:
@@ -2743,7 +2749,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
blk_schedule_flush_plug(tsk);
}
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
{
struct task_struct *tsk = current;
@@ -2753,7 +2759,7 @@ asmlinkage void __sched schedule(void)
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
{
/*
* If we come here after a random call to set_need_resched(),
@@ -2785,7 +2791,7 @@ void __sched schedule_preempt_disabled(void)
* off of preempt_enable. Kernel preemptions off return from interrupt
* occur there and call schedule directly.
*/
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
{
/*
* If there is a non-zero preempt_count or interrupts are disabled,
@@ -2816,7 +2822,7 @@ EXPORT_SYMBOL(preempt_schedule);
* Note, that this is called and return with irqs disabled. This will
* protect us against recursive calling from irq.
*/
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
{
enum ctx_state prev_state;
@@ -3127,6 +3133,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
+ dl_se->dl_yielded = 0;
}
static void __setscheduler_params(struct task_struct *p,
@@ -3191,17 +3198,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
* We ask for the deadline not being zero, and greater or equal
* than the runtime, as well as the period of being zero or
* greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
*/
static bool
__checkparam_dl(const struct sched_attr *attr)
{
- return attr && attr->sched_deadline != 0 &&
- (attr->sched_period == 0 ||
- (s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
- (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
- attr->sched_runtime >= (2 << (DL_SCALE - 1));
+ /* deadline != 0 */
+ if (attr->sched_deadline == 0)
+ return false;
+
+ /*
+ * Since we truncate DL_SCALE bits, make sure we're at least
+ * that big.
+ */
+ if (attr->sched_runtime < (1ULL << DL_SCALE))
+ return false;
+
+ /*
+ * Since we use the MSB for wrap-around and sign issues, make
+ * sure it's not set (mind that period can be equal to zero).
+ */
+ if (attr->sched_deadline & (1ULL << 63) ||
+ attr->sched_period & (1ULL << 63))
+ return false;
+
+ /* runtime <= deadline <= period (if period != 0) */
+ if ((attr->sched_period != 0 &&
+ attr->sched_period < attr->sched_deadline) ||
+ attr->sched_deadline < attr->sched_runtime)
+ return false;
+
+ return true;
}
/*
@@ -3642,6 +3672,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
* sys_sched_setattr - same as above, but with extended sched_attr
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, flags)
@@ -3653,8 +3684,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
if (!uattr || pid < 0 || flags)
return -EINVAL;
- if (sched_copy_attr(uattr, &attr))
- return -EFAULT;
+ retval = sched_copy_attr(uattr, &attr);
+ if (retval)
+ return retval;
+
+ if (attr.sched_policy < 0)
+ return -EINVAL;
rcu_read_lock();
retval = -ESRCH;
@@ -3704,7 +3739,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
*/
SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
{
- struct sched_param lp;
+ struct sched_param lp = { .sched_priority = 0 };
struct task_struct *p;
int retval;
@@ -3721,11 +3756,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
if (retval)
goto out_unlock;
- if (task_has_dl_policy(p)) {
- retval = -EINVAL;
- goto out_unlock;
- }
- lp.sched_priority = p->rt_priority;
+ if (task_has_rt_policy(p))
+ lp.sched_priority = p->rt_priority;
rcu_read_unlock();
/*
@@ -3786,6 +3818,7 @@ err_size:
* @pid: the pid in question.
* @uattr: structure containing the extended parameters.
* @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
*/
SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
unsigned int, size, unsigned int, flags)
@@ -5046,7 +5079,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_STARTING:
case CPU_DOWN_FAILED:
set_cpu_active((long)hcpu, true);
return NOTIFY_OK;
@@ -6020,6 +6052,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
,
.last_balance = jiffies,
.balance_interval = sd_weight,
+ .max_newidle_lb_cost = 0,
+ .next_decay_max_lb_cost = jiffies,
};
SD_INIT_NAME(sd, NUMA);
sd->private = &tl->data;
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5b9bb42b2d47..bd95963dae80 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -13,6 +13,7 @@
#include <linux/gfp.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include "cpudeadline.h"
static inline int parent(int i)
@@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b)
{
int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
- swap(cp->elements[a], cp->elements[b]);
- swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
+ swap(cp->elements[a].cpu, cp->elements[b].cpu);
+ swap(cp->elements[a].dl , cp->elements[b].dl );
+
+ swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
}
static void cpudl_heapify(struct cpudl *cp, int idx)
@@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
WARN_ON(!cpu_present(cpu));
raw_spin_lock_irqsave(&cp->lock, flags);
- old_idx = cp->cpu_to_idx[cpu];
+ old_idx = cp->elements[cpu].idx;
if (!is_valid) {
/* remove item */
if (old_idx == IDX_INVALID) {
@@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
cp->elements[old_idx].cpu = new_cpu;
cp->size--;
- cp->cpu_to_idx[new_cpu] = old_idx;
- cp->cpu_to_idx[cpu] = IDX_INVALID;
+ cp->elements[new_cpu].idx = old_idx;
+ cp->elements[cpu].idx = IDX_INVALID;
while (old_idx > 0 && dl_time_before(
cp->elements[parent(old_idx)].dl,
cp->elements[old_idx].dl)) {
@@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
cp->size++;
cp->elements[cp->size - 1].dl = 0;
cp->elements[cp->size - 1].cpu = cpu;
- cp->cpu_to_idx[cpu] = cp->size - 1;
+ cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
cpumask_clear_cpu(cpu, cp->free_cpus);
} else {
@@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp)
memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
- for (i = 0; i < NR_CPUS; i++)
- cp->cpu_to_idx[i] = IDX_INVALID;
- if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
+
+ cp->elements = kcalloc(nr_cpu_ids,
+ sizeof(struct cpudl_item),
+ GFP_KERNEL);
+ if (!cp->elements)
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+ kfree(cp->elements);
return -ENOMEM;
+ }
+
+ for_each_possible_cpu(i)
+ cp->elements[i].idx = IDX_INVALID;
+
cpumask_setall(cp->free_cpus);
return 0;
@@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp)
*/
void cpudl_cleanup(struct cpudl *cp)
{
- /*
- * nothing to do for the moment
- */
+ free_cpumask_var(cp->free_cpus);
+ kfree(cp->elements);
}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index a202789a412c..538c9796ad4a 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -5,17 +5,17 @@
#define IDX_INVALID -1
-struct array_item {
+struct cpudl_item {
u64 dl;
int cpu;
+ int idx;
};
struct cpudl {
raw_spinlock_t lock;
int size;
- int cpu_to_idx[NR_CPUS];
- struct array_item elements[NR_CPUS];
cpumask_var_t free_cpus;
+ struct cpudl_item *elements;
};
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 8b836b376d91..8834243abee2 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -30,6 +30,7 @@
#include <linux/gfp.h>
#include <linux/sched.h>
#include <linux/sched/rt.h>
+#include <linux/slab.h>
#include "cpupri.h"
/* Convert between a 140 based task->prio, and our 102 based cpupri */
@@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
int idx = 0;
int task_pri = convert_prio(p->prio);
- if (task_pri >= MAX_RT_PRIO)
- return 0;
+ BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES);
for (idx = 0; idx < task_pri; idx++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
@@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp)
goto cleanup;
}
+ cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
+ if (!cp->cpu_to_pri)
+ goto cleanup;
+
for_each_possible_cpu(i)
cp->cpu_to_pri[i] = CPUPRI_INVALID;
+
return 0;
cleanup:
@@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp)
{
int i;
+ kfree(cp->cpu_to_pri);
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
free_cpumask_var(cp->pri_to_cpu[i].mask);
}
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..6b033347fdfd 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -17,7 +17,7 @@ struct cpupri_vec {
struct cpupri {
struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
- int cpu_to_pri[NR_CPUS];
+ int *cpu_to_pri;
};
#ifdef CONFIG_SMP
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a95097cb4591..72fdf06ef865 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -332,50 +332,50 @@ out:
* softirq as those do not count in task exec_runtime any more.
*/
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq)
+ struct rq *rq, int ticks)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 cputime = (__force u64) cputime_one_jiffy;
u64 *cpustat = kcpustat_this_cpu->cpustat;
if (steal_account_process_tick())
return;
+ cputime *= ticks;
+ scaled *= ticks;
+
if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_IRQ] += cputime;
} else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
+ cpustat[CPUTIME_SOFTIRQ] += cputime;
} else if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
* Also, p->stime needs to be updated for ksoftirqd.
*/
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SOFTIRQ);
+ __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ);
} else if (user_tick) {
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
} else if (p == rq->idle) {
- account_idle_time(cputime_one_jiffy);
+ account_idle_time(cputime);
} else if (p->flags & PF_VCPU) { /* System time or guest time */
- account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_guest_time(p, cputime, scaled);
} else {
- __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
- CPUTIME_SYSTEM);
+ __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM);
}
}
static void irqtime_account_idle_ticks(int ticks)
{
- int i;
struct rq *rq = this_rq();
- for (i = 0; i < ticks; i++)
- irqtime_account_process_tick(current, 0, rq);
+ irqtime_account_process_tick(current, 0, rq, ticks);
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
static inline void irqtime_account_idle_ticks(int ticks) {}
static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
- struct rq *rq) {}
+ struct rq *rq, int nr_ticks) {}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
/*
@@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
if (sched_clock_irqtime) {
- irqtime_account_process_tick(p, user_tick, rq);
+ irqtime_account_process_tick(p, user_tick, rq, 1);
return;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b08095786cb8..800e99b99075 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
sched_clock_tick();
update_rq_clock(rq);
dl_se->dl_throttled = 0;
+ dl_se->dl_yielded = 0;
if (p->on_rq) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
if (task_has_dl_policy(rq->curr))
@@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq)
* We make the task go to sleep until its current deadline by
* forcing its runtime to zero. This way, update_curr_dl() stops
* it and the bandwidth timer will wake it up and will give it
- * new scheduling parameters (thanks to dl_new=1).
+ * new scheduling parameters (thanks to dl_yielded=1).
*/
if (p->dl.runtime > 0) {
- rq->curr->dl.dl_new = 1;
+ rq->curr->dl.dl_yielded = 1;
p->dl.runtime = 0;
}
update_curr_dl(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7570dd969c28..0fdb96de81a5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6653,6 +6653,7 @@ static int idle_balance(struct rq *this_rq)
int this_cpu = this_rq->cpu;
idle_enter_fair(this_rq);
+
/*
* We must set idle_stamp _before_ calling idle_balance(), such that we
* measure the duration of idle_balance() as idle time.
@@ -6705,14 +6706,16 @@ static int idle_balance(struct rq *this_rq)
raw_spin_lock(&this_rq->lock);
+ if (curr_cost > this_rq->max_idle_balance_cost)
+ this_rq->max_idle_balance_cost = curr_cost;
+
/*
- * While browsing the domains, we released the rq lock.
- * A task could have be enqueued in the meantime
+ * While browsing the domains, we released the rq lock, a task could
+ * have been enqueued in the meantime. Since we're not going idle,
+ * pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task) {
+ if (this_rq->cfs.h_nr_running && !pulled_task)
pulled_task = 1;
- goto out;
- }
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
/*
@@ -6722,9 +6725,6 @@ static int idle_balance(struct rq *this_rq)
this_rq->next_balance = next_balance;
}
- if (curr_cost > this_rq->max_idle_balance_cost)
- this_rq->max_idle_balance_cost = curr_cost;
-
out:
/* Is there a task of a high priority class? */
if (this_rq->nr_running != this_rq->cfs.h_nr_running &&
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 33e4648ae0e7..92f24f5e8d52 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -223,7 +223,7 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
-asmlinkage void __do_softirq(void)
+asmlinkage __visible void __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
@@ -299,7 +299,7 @@ restart:
tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
-asmlinkage void do_softirq(void)
+asmlinkage __visible void do_softirq(void)
{
__u32 pending;
unsigned long flags;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index ac5b23cf7212..6620e5837ce2 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -188,7 +188,6 @@ static int tracepoint_add_func(struct tracepoint *tp,
WARN_ON_ONCE(1);
return PTR_ERR(old);
}
- release_probes(old);
/*
* rcu_assign_pointer has a smp_wmb() which makes sure that the new
@@ -200,6 +199,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
rcu_assign_pointer(tp->funcs, tp_funcs);
if (!static_key_enabled(&tp->key))
static_key_slow_inc(&tp->key);
+ release_probes(old);
return 0;
}
@@ -221,7 +221,6 @@ static int tracepoint_remove_func(struct tracepoint *tp,
WARN_ON_ONCE(1);
return PTR_ERR(old);
}
- release_probes(old);
if (!tp_funcs) {
/* Removed last function */
@@ -232,6 +231,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
static_key_slow_dec(&tp->key);
}
rcu_assign_pointer(tp->funcs, tp_funcs);
+ release_probes(old);
return 0;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0ee63af30bd1..8edc87185427 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1916,6 +1916,12 @@ static void send_mayday(struct work_struct *work)
/* mayday mayday mayday */
if (list_empty(&pwq->mayday_node)) {
+ /*
+ * If @pwq is for an unbound wq, its base ref may be put at
+ * any time due to an attribute change. Pin @pwq until the
+ * rescuer is done with it.
+ */
+ get_pwq(pwq);
list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
}
@@ -2398,6 +2404,7 @@ static int rescuer_thread(void *__rescuer)
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
+ bool should_stop;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2409,11 +2416,15 @@ static int rescuer_thread(void *__rescuer)
repeat:
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop()) {
- __set_current_state(TASK_RUNNING);
- rescuer->task->flags &= ~PF_WQ_WORKER;
- return 0;
- }
+ /*
+ * By the time the rescuer is requested to stop, the workqueue
+ * shouldn't have any work pending, but @wq->maydays may still have
+ * pwq(s) queued. This can happen by non-rescuer workers consuming
+ * all the work items before the rescuer got to them. Go through
+ * @wq->maydays processing before acting on should_stop so that the
+ * list is always empty on exit.
+ */
+ should_stop = kthread_should_stop();
/* see whether any pwq is asking for help */
spin_lock_irq(&wq_mayday_lock);
@@ -2445,6 +2456,12 @@ repeat:
process_scheduled_works(rescuer);
/*
+ * Put the reference grabbed by send_mayday(). @pool won't
+ * go away while we're holding its lock.
+ */
+ put_pwq(pwq);
+
+ /*
* Leave this pool. If keep_working() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
@@ -2459,6 +2476,12 @@ repeat:
spin_unlock_irq(&wq_mayday_lock);
+ if (should_stop) {
+ __set_current_state(TASK_RUNNING);
+ rescuer->task->flags &= ~PF_WQ_WORKER;
+ return 0;
+ }
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
@@ -4100,7 +4123,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
if (!pwq) {
pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
wq->name);
- goto out_unlock;
+ mutex_lock(&wq->mutex);
+ goto use_dfl_pwq;
}
/*