diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 2 | ||||
-rw-r--r-- | kernel/cgroup_freezer.c | 116 | ||||
-rw-r--r-- | kernel/cpu.c | 6 | ||||
-rw-r--r-- | kernel/events/core.c | 174 | ||||
-rw-r--r-- | kernel/futex.c | 52 | ||||
-rw-r--r-- | kernel/kexec.c | 8 | ||||
-rw-r--r-- | kernel/locking/rtmutex.c | 32 | ||||
-rw-r--r-- | kernel/sched/core.c | 70 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 37 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.h | 6 | ||||
-rw-r--r-- | kernel/sched/cpupri.c | 10 | ||||
-rw-r--r-- | kernel/sched/cpupri.h | 2 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 32 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 5 | ||||
-rw-r--r-- | kernel/sched/fair.c | 16 |
15 files changed, 337 insertions, 231 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 11a03d67635a..3f1ca934a237 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -348,7 +348,7 @@ struct cgrp_cset_link { * reference-counted, to improve performance when child cgroups * haven't been created. */ -static struct css_set init_css_set = { +struct css_set init_css_set = { .refcount = ATOMIC_INIT(1), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), .tasks = LIST_HEAD_INIT(init_css_set.tasks), diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2bc4a2256444..345628c78b5b 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -21,6 +21,7 @@ #include <linux/uaccess.h> #include <linux/freezer.h> #include <linux/seq_file.h> +#include <linux/mutex.h> /* * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is @@ -42,9 +43,10 @@ enum freezer_state_flags { struct freezer { struct cgroup_subsys_state css; unsigned int state; - spinlock_t lock; }; +static DEFINE_MUTEX(freezer_mutex); + static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) { return css ? container_of(css, struct freezer, css) : NULL; @@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) if (!freezer) return ERR_PTR(-ENOMEM); - spin_lock_init(&freezer->lock); return &freezer->css; } @@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); - /* - * The following double locking and freezing state inheritance - * guarantee that @cgroup can never escape ancestors' freezing - * states. See css_for_each_descendant_pre() for details. - */ - if (parent) - spin_lock_irq(&parent->lock); - spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); + mutex_lock(&freezer_mutex); freezer->state |= CGROUP_FREEZER_ONLINE; @@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) atomic_inc(&system_freezing_cnt); } - spin_unlock(&freezer->lock); - if (parent) - spin_unlock_irq(&parent->lock); - + mutex_unlock(&freezer_mutex); return 0; } @@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); - spin_lock_irq(&freezer->lock); + mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); freezer->state = 0; - spin_unlock_irq(&freezer->lock); + mutex_unlock(&freezer_mutex); } static void freezer_css_free(struct cgroup_subsys_state *css) @@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, struct task_struct *task; bool clear_frozen = false; - spin_lock_irq(&freezer->lock); + mutex_lock(&freezer_mutex); /* * Make the new tasks conform to the current state of @new_css. @@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, } } - spin_unlock_irq(&freezer->lock); - - /* - * Propagate FROZEN clearing upwards. We may race with - * update_if_frozen(), but as long as both work bottom-up, either - * update_if_frozen() sees child's FROZEN cleared or we clear the - * parent's FROZEN later. No parent w/ !FROZEN children can be - * left FROZEN. - */ + /* propagate FROZEN clearing upwards */ while (clear_frozen && (freezer = parent_freezer(freezer))) { - spin_lock_irq(&freezer->lock); freezer->state &= ~CGROUP_FROZEN; clear_frozen = freezer->state & CGROUP_FREEZING; - spin_unlock_irq(&freezer->lock); } + + mutex_unlock(&freezer_mutex); } /** @@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; - rcu_read_lock(); - freezer = task_freezer(task); - /* * The root cgroup is non-freezable, so we can skip locking the * freezer. This is safe regardless of race with task migration. @@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task) * to do. If we lost and root is the new cgroup, noop is still the * right thing to do. */ - if (!parent_freezer(freezer)) - goto out; + if (task_css_is_root(task, freezer_cgrp_id)) + return; - /* - * Grab @freezer->lock and freeze @task after verifying @task still - * belongs to @freezer and it's freezing. The former is for the - * case where we have raced against task migration and lost and - * @task is already in a different cgroup which may not be frozen. - * This isn't strictly necessary as freeze_task() is allowed to be - * called spuriously but let's do it anyway for, if nothing else, - * documentation. - */ - spin_lock_irq(&freezer->lock); - if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING)) + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) freeze_task(task); - spin_unlock_irq(&freezer->lock); -out: + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); } /** @@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css) struct css_task_iter it; struct task_struct *task; - WARN_ON_ONCE(!rcu_read_lock_held()); - - spin_lock_irq(&freezer->lock); + lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) - goto out_unlock; + return; /* are all (live) children frozen? */ + rcu_read_lock(); css_for_each_child(pos, css) { struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) - goto out_unlock; + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } } + rcu_read_unlock(); /* are all tasks frozen? */ css_task_iter_start(css, &it); @@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: css_task_iter_end(&it); -out_unlock: - spin_unlock_irq(&freezer->lock); } static int freezer_read(struct seq_file *m, void *v) { struct cgroup_subsys_state *css = seq_css(m), *pos; + mutex_lock(&freezer_mutex); rcu_read_lock(); /* update states bottom-up */ - css_for_each_descendant_post(pos, css) + css_for_each_descendant_post(pos, css) { + if (!css_tryget(pos)) + continue; + rcu_read_unlock(); + update_if_frozen(pos); + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); @@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer->lock); + lockdep_assert_held(&freezer_mutex); if (!(freezer->state & CGROUP_FREEZER_ONLINE)) return; @@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * descendant will try to inherit its parent's FREEZING state as * CGROUP_FREEZING_PARENT. */ + mutex_lock(&freezer_mutex); rcu_read_lock(); css_for_each_descendant_pre(pos, &freezer->css) { struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - spin_lock_irq(&pos_f->lock); + if (!css_tryget(pos)) + continue; + rcu_read_unlock(); - if (pos_f == freezer) { + if (pos_f == freezer) freezer_apply_state(pos_f, freeze, CGROUP_FREEZING_SELF); - } else { - /* - * Our update to @parent->state is already visible - * which is all we need. No need to lock @parent. - * For more info on synchronization, see - * freezer_post_create(). - */ + else freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, CGROUP_FREEZING_PARENT); - } - spin_unlock_irq(&pos_f->lock); + rcu_read_lock(); + css_put(pos); } rcu_read_unlock(); + mutex_unlock(&freezer_mutex); } static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, diff --git a/kernel/cpu.c b/kernel/cpu.c index a9e710eef0e2..247979a1b815 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -726,10 +726,12 @@ void set_cpu_present(unsigned int cpu, bool present) void set_cpu_online(unsigned int cpu, bool online) { - if (online) + if (online) { cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - else + cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); + } else { cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); + } } void set_cpu_active(unsigned int cpu, bool active) diff --git a/kernel/events/core.c b/kernel/events/core.c index f83a71a3e46d..440eefc67397 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1443,6 +1443,11 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct remove_event { + struct perf_event *event; + bool detach_group; +}; + /* * Cross CPU call to remove a performance event * @@ -1451,12 +1456,15 @@ group_sched_out(struct perf_event *group_event, */ static int __perf_remove_from_context(void *info) { - struct perf_event *event = info; + struct remove_event *re = info; + struct perf_event *event = re->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); + if (re->detach_group) + perf_group_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && cpuctx->task_ctx == ctx) { ctx->is_active = 0; @@ -1481,10 +1489,14 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct remove_event re = { + .event = event, + .detach_group = detach_group, + }; lockdep_assert_held(&ctx->mutex); @@ -1493,12 +1505,12 @@ static void perf_remove_from_context(struct perf_event *event) * Per cpu events are removed via an smp call and * the removal is always successful. */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); + cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; } retry: - if (!task_function_call(task, __perf_remove_from_context, event)) + if (!task_function_call(task, __perf_remove_from_context, &re)) return; raw_spin_lock_irq(&ctx->lock); @@ -1515,6 +1527,8 @@ retry: * Since the task isn't running, its safe to remove the event, us * holding the ctx->lock ensures the task won't get scheduled in. */ + if (detach_group) + perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -3178,7 +3192,8 @@ static void free_event_rcu(struct rcu_head *head) } static void ring_buffer_put(struct ring_buffer *rb); -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); static void unaccount_event_cpu(struct perf_event *event, int cpu) { @@ -3238,8 +3253,6 @@ static void free_event(struct perf_event *event) unaccount_event(event); if (event->rb) { - struct ring_buffer *rb; - /* * Can happen when we close an event with re-directed output. * @@ -3247,12 +3260,7 @@ static void free_event(struct perf_event *event) * over us; possibly making our ring_buffer_put() the last. */ mutex_lock(&event->mmap_mutex); - rb = event->rb; - if (rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* could be last */ - } + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); } @@ -3281,10 +3289,7 @@ int perf_event_release_kernel(struct perf_event *event) * to trigger the AB-BA case. */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - raw_spin_unlock_irq(&ctx->lock); - perf_remove_from_context(event); + perf_remove_from_context(event, true); mutex_unlock(&ctx->mutex); free_event(event); @@ -3839,28 +3844,47 @@ unlock: static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb) { + struct ring_buffer *old_rb = NULL; unsigned long flags; - if (!list_empty(&event->rb_entry)) - return; + if (event->rb) { + /* + * Should be impossible, we set this when removing + * event->rb_entry and wait/clear when adding event->rb_entry. + */ + WARN_ON_ONCE(event->rcu_pending); - spin_lock_irqsave(&rb->event_lock, flags); - if (list_empty(&event->rb_entry)) - list_add(&event->rb_entry, &rb->event_list); - spin_unlock_irqrestore(&rb->event_lock, flags); -} + old_rb = event->rb; + event->rcu_batches = get_state_synchronize_rcu(); + event->rcu_pending = 1; -static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) -{ - unsigned long flags; + spin_lock_irqsave(&old_rb->event_lock, flags); + list_del_rcu(&event->rb_entry); + spin_unlock_irqrestore(&old_rb->event_lock, flags); + } - if (list_empty(&event->rb_entry)) - return; + if (event->rcu_pending && rb) { + cond_synchronize_rcu(event->rcu_batches); + event->rcu_pending = 0; + } + + if (rb) { + spin_lock_irqsave(&rb->event_lock, flags); + list_add_rcu(&event->rb_entry, &rb->event_list); + spin_unlock_irqrestore(&rb->event_lock, flags); + } + + rcu_assign_pointer(event->rb, rb); - spin_lock_irqsave(&rb->event_lock, flags); - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - spin_unlock_irqrestore(&rb->event_lock, flags); + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } } static void ring_buffer_wakeup(struct perf_event *event) @@ -3929,7 +3953,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - struct ring_buffer *rb = event->rb; + struct ring_buffer *rb = ring_buffer_get(event); struct user_struct *mmap_user = rb->mmap_user; int mmap_locked = rb->mmap_locked; unsigned long size = perf_data_size(rb); @@ -3937,18 +3961,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) - return; + goto out_put; - /* Detach current event from the buffer. */ - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); + ring_buffer_attach(event, NULL); mutex_unlock(&event->mmap_mutex); /* If there's still other mmap()s of this buffer, we're done. */ - if (atomic_read(&rb->mmap_count)) { - ring_buffer_put(rb); /* can't be last */ - return; - } + if (atomic_read(&rb->mmap_count)) + goto out_put; /* * No other mmap()s, detach from all other events that might redirect @@ -3978,11 +3998,9 @@ again: * still restart the iteration to make sure we're not now * iterating the wrong list. */ - if (event->rb == rb) { - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - ring_buffer_put(rb); /* can't be last, we still have one */ - } + if (event->rb == rb) + ring_buffer_attach(event, NULL); + mutex_unlock(&event->mmap_mutex); put_event(event); @@ -4007,6 +4025,7 @@ again: vma->vm_mm->pinned_vm -= mmap_locked; free_uid(mmap_user); +out_put: ring_buffer_put(rb); /* could be last */ } @@ -4124,7 +4143,6 @@ again: vma->vm_mm->pinned_vm += extra; ring_buffer_attach(event, rb); - rcu_assign_pointer(event->rb, rb); perf_event_init_userpage(event); perf_event_update_userpage(event); @@ -5408,6 +5426,9 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; + + /* Keeps track of cpu being initialized/exited */ + bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -5654,8 +5675,14 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (WARN_ON_ONCE(!head)) + if (!head) { + /* + * We can race with cpu hotplug code. Do not + * WARN if the cpu just got unplugged. + */ + WARN_ON_ONCE(swhash->online); return -EINVAL; + } hlist_add_head_rcu(&event->hlist_entry, head); @@ -6914,7 +6941,7 @@ err_size: static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct ring_buffer *rb = NULL, *old_rb = NULL; + struct ring_buffer *rb = NULL; int ret = -EINVAL; if (!output_event) @@ -6942,8 +6969,6 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; - old_rb = event->rb; - if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6951,23 +6976,7 @@ set: goto unlock; } - if (old_rb) - ring_buffer_detach(event, old_rb); - - if (rb) - ring_buffer_attach(event, rb); - - rcu_assign_pointer(event->rb, rb); - - if (old_rb) { - ring_buffer_put(old_rb); - /* - * Since we detached before setting the new rb, so that we - * could attach the new rb, we could have missed a wakeup. - * Provide it now. - */ - wake_up_all(&event->waitq); - } + ring_buffer_attach(event, rb); ret = 0; unlock: @@ -7018,6 +7027,9 @@ SYSCALL_DEFINE5(perf_event_open, if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; + } else { + if (attr.sample_period & (1ULL << 63)) + return -EINVAL; } /* @@ -7165,7 +7177,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); + perf_remove_from_context(group_leader, false); /* * Removing from the context ends up with disabled @@ -7175,7 +7187,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling); + perf_remove_from_context(sibling, false); perf_event__state_init(sibling); put_ctx(gctx); } @@ -7305,7 +7317,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock(&src_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event); + perf_remove_from_context(event, false); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -7367,13 +7379,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); + perf_remove_from_context(child_event, !!child_event->parent); /* * It can happen that the parent exits first, and has events @@ -7724,6 +7730,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn) * swapped under us. */ parent_ctx = perf_pin_task_context(parent, ctxn); + if (!parent_ctx) + return 0; /* * No need to check if parent_ctx != NULL here; since we saw @@ -7835,6 +7843,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -7857,14 +7866,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { + struct remove_event re = { .detach_group = false }; struct perf_event_context *ctx = __info; - struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); rcu_read_lock(); - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) - __perf_remove_from_context(event); + list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) + __perf_remove_from_context(&re); rcu_read_unlock(); } @@ -7892,6 +7901,7 @@ static void perf_event_exit_cpu(int cpu) perf_event_exit_cpu_context(cpu); mutex_lock(&swhash->hlist_mutex); + swhash->online = false; swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); } diff --git a/kernel/futex.c b/kernel/futex.c index 5f589279e462..81dbe773ce4c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -745,7 +745,8 @@ void exit_pi_state_list(struct task_struct *curr) static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, - union futex_key *key, struct futex_pi_state **ps) + union futex_key *key, struct futex_pi_state **ps, + struct task_struct *task) { struct futex_pi_state *pi_state = NULL; struct futex_q *this, *next; @@ -786,6 +787,16 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; } + /* + * Protect against a corrupted uval. If uval + * is 0x80000000 then pid is 0 and the waiter + * bit is set. So the deadlock check in the + * calling code has failed and we did not fall + * into the check above due to !pid. + */ + if (task && pi_state->owner == task) + return -EDEADLK; + atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -803,6 +814,11 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!p) return -ESRCH; + if (!p->mm) { + put_task_struct(p); + return -EPERM; + } + /* * We need to look at the task state flags to figure out, * whether the task is exiting. To protect against the do_exit @@ -935,7 +951,7 @@ retry: * We dont have the lock. Look up the PI state (or create it if * we are the first waiter): */ - ret = lookup_pi_state(uval, hb, key, ps); + ret = lookup_pi_state(uval, hb, key, ps, task); if (unlikely(ret)) { switch (ret) { @@ -1347,7 +1363,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * * Return: * 0 - failed to acquire the lock atomically; - * 1 - acquired the lock; + * >0 - acquired the lock, return value is vpid of the top_waiter * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1358,7 +1374,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, { struct futex_q *top_waiter = NULL; u32 curval; - int ret; + int ret, vpid; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -1386,11 +1402,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * the contended case or if set_waiters is 1. The pi_state is returned * in ps in contended cases. */ + vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, set_waiters); - if (ret == 1) + if (ret == 1) { requeue_pi_wake_futex(top_waiter, key2, hb2); - + return vpid; + } return ret; } @@ -1421,7 +1439,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - u32 curval2; if (requeue_pi) { /* @@ -1509,16 +1526,25 @@ retry_private: * At this point the top_waiter has either taken uaddr2 or is * waiting on it. If the former, then the pi_state will not * exist yet, look it up one more time to ensure we have a - * reference to it. + * reference to it. If the lock was taken, ret contains the + * vpid of the top waiter task. */ - if (ret == 1) { + if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; - ret = get_futex_value_locked(&curval2, uaddr2); - if (!ret) - ret = lookup_pi_state(curval2, hb2, &key2, - &pi_state); + /* + * If we acquired the lock, then the user + * space value of uaddr2 should be vpid. It + * cannot be changed by the top waiter as it + * is blocked on hb2 lock if it tries to do + * so. If something fiddled with it behind our + * back the pi state lookup might unearth + * it. So we rather use the known value than + * rereading and handing potential crap to + * lookup_pi_state. + */ + ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL); } switch (ret) { diff --git a/kernel/kexec.c b/kernel/kexec.c index c8380ad203bc..28c57069ef68 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1683,6 +1683,14 @@ int kernel_kexec(void) kexec_in_progress = true; kernel_restart_prepare(NULL); migrate_to_reboot_cpu(); + + /* + * migrate_to_reboot_cpu() disables CPU hotplug assuming that + * no further code needs to use CPU hotplug (which is true in + * the reboot case). However, the kexec path depends on using + * CPU hotplug again; so re-enable it here. + */ + cpu_hotplug_enable(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index aa4dff04b594..a620d4d08ca6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -343,9 +343,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * top_waiter can be NULL, when we are in the deboosting * mode! */ - if (top_waiter && (!task_has_pi_waiters(task) || - top_waiter != task_top_pi_waiter(task))) - goto out_unlock_pi; + if (top_waiter) { + if (!task_has_pi_waiters(task)) + goto out_unlock_pi; + /* + * If deadlock detection is off, we stop here if we + * are not the top pi waiter of the task. + */ + if (!detect_deadlock && top_waiter != task_top_pi_waiter(task)) + goto out_unlock_pi; + } /* * When deadlock detection is off then we check, if further @@ -361,7 +368,12 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto retry; } - /* Deadlock detection */ + /* + * Deadlock detection. If the lock is the same as the original + * lock which caused us to walk the lock chain or if the + * current lock is owned by the task which initiated the chain + * walk, we detected a deadlock. + */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); @@ -527,6 +539,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; + /* + * Early deadlock detection. We really don't want the task to + * enqueue on itself just to untangle the mess later. It's not + * only an optimization. We drop the locks, so another waiter + * can come in before the chain walk detects the deadlock. So + * the other will detect the deadlock and return -EDEADLOCK, + * which is wrong, as the other waiter is not in a deadlock + * situation. + */ + if (detect_deadlock && owner == task) + return -EDEADLK; + raw_spin_lock_irqsave(&task->pi_lock, flags); __rt_mutex_adjust_prio(task); waiter->task = task; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d9d8ece46a15..0a7251678982 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2592,8 +2592,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev) if (likely(prev->sched_class == class && rq->nr_running == rq->cfs.h_nr_running)) { p = fair_sched_class.pick_next_task(rq, prev); - if (likely(p && p != RETRY_TASK)) - return p; + if (unlikely(p == RETRY_TASK)) + goto again; + + /* assumes fair_sched_class->next == idle_sched_class */ + if (unlikely(!p)) + p = idle_sched_class.pick_next_task(rq, prev); + + return p; } again: @@ -3124,6 +3130,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr) dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); dl_se->dl_throttled = 0; dl_se->dl_new = 1; + dl_se->dl_yielded = 0; } static void __setscheduler_params(struct task_struct *p, @@ -3188,17 +3195,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr) * We ask for the deadline not being zero, and greater or equal * than the runtime, as well as the period of being zero or * greater than deadline. Furthermore, we have to be sure that - * user parameters are above the internal resolution (1us); we - * check sched_runtime only since it is always the smaller one. + * user parameters are above the internal resolution of 1us (we + * check sched_runtime only since it is always the smaller one) and + * below 2^63 ns (we have to check both sched_deadline and + * sched_period, as the latter can be zero). */ static bool __checkparam_dl(const struct sched_attr *attr) { - return attr && attr->sched_deadline != 0 && - (attr->sched_period == 0 || - (s64)(attr->sched_period - attr->sched_deadline) >= 0) && - (s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 && - attr->sched_runtime >= (2 << (DL_SCALE - 1)); + /* deadline != 0 */ + if (attr->sched_deadline == 0) + return false; + + /* + * Since we truncate DL_SCALE bits, make sure we're at least + * that big. + */ + if (attr->sched_runtime < (1ULL << DL_SCALE)) + return false; + + /* + * Since we use the MSB for wrap-around and sign issues, make + * sure it's not set (mind that period can be equal to zero). + */ + if (attr->sched_deadline & (1ULL << 63) || + attr->sched_period & (1ULL << 63)) + return false; + + /* runtime <= deadline <= period (if period != 0) */ + if ((attr->sched_period != 0 && + attr->sched_period < attr->sched_deadline) || + attr->sched_deadline < attr->sched_runtime) + return false; + + return true; } /* @@ -3639,6 +3669,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) * sys_sched_setattr - same as above, but with extended sched_attr * @pid: the pid in question. * @uattr: structure containing the extended parameters. + * @flags: for future extension. */ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) @@ -3650,8 +3681,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (!uattr || pid < 0 || flags) return -EINVAL; - if (sched_copy_attr(uattr, &attr)) - return -EFAULT; + retval = sched_copy_attr(uattr, &attr); + if (retval) + return retval; + + if (attr.sched_policy < 0) + return -EINVAL; rcu_read_lock(); retval = -ESRCH; @@ -3701,7 +3736,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) { - struct sched_param lp; + struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; int retval; @@ -3718,11 +3753,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (retval) goto out_unlock; - if (task_has_dl_policy(p)) { - retval = -EINVAL; - goto out_unlock; - } - lp.sched_priority = p->rt_priority; + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; rcu_read_unlock(); /* @@ -3783,6 +3815,7 @@ err_size: * @pid: the pid in question. * @uattr: structure containing the extended parameters. * @size: sizeof(attr) for fwd/bwd comp. + * @flags: for future extension. */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, size, unsigned int, flags) @@ -5043,7 +5076,6 @@ static int sched_cpu_active(struct notifier_block *nfb, unsigned long action, void *hcpu) { switch (action & ~CPU_TASKS_FROZEN) { - case CPU_STARTING: case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -6017,6 +6049,8 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) , .last_balance = jiffies, .balance_interval = sd_weight, + .max_newidle_lb_cost = 0, + .next_decay_max_lb_cost = jiffies, }; SD_INIT_NAME(sd, NUMA); sd->private = &tl->data; diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5b9bb42b2d47..bd95963dae80 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -13,6 +13,7 @@ #include <linux/gfp.h> #include <linux/kernel.h> +#include <linux/slab.h> #include "cpudeadline.h" static inline int parent(int i) @@ -39,8 +40,10 @@ static void cpudl_exchange(struct cpudl *cp, int a, int b) { int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; - swap(cp->elements[a], cp->elements[b]); - swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]); + swap(cp->elements[a].cpu, cp->elements[b].cpu); + swap(cp->elements[a].dl , cp->elements[b].dl ); + + swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); } static void cpudl_heapify(struct cpudl *cp, int idx) @@ -140,7 +143,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) WARN_ON(!cpu_present(cpu)); raw_spin_lock_irqsave(&cp->lock, flags); - old_idx = cp->cpu_to_idx[cpu]; + old_idx = cp->elements[cpu].idx; if (!is_valid) { /* remove item */ if (old_idx == IDX_INVALID) { @@ -155,8 +158,8 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; cp->elements[old_idx].cpu = new_cpu; cp->size--; - cp->cpu_to_idx[new_cpu] = old_idx; - cp->cpu_to_idx[cpu] = IDX_INVALID; + cp->elements[new_cpu].idx = old_idx; + cp->elements[cpu].idx = IDX_INVALID; while (old_idx > 0 && dl_time_before( cp->elements[parent(old_idx)].dl, cp->elements[old_idx].dl)) { @@ -173,7 +176,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) cp->size++; cp->elements[cp->size - 1].dl = 0; cp->elements[cp->size - 1].cpu = cpu; - cp->cpu_to_idx[cpu] = cp->size - 1; + cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); cpumask_clear_cpu(cpu, cp->free_cpus); } else { @@ -195,10 +198,21 @@ int cpudl_init(struct cpudl *cp) memset(cp, 0, sizeof(*cp)); raw_spin_lock_init(&cp->lock); cp->size = 0; - for (i = 0; i < NR_CPUS; i++) - cp->cpu_to_idx[i] = IDX_INVALID; - if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) + + cp->elements = kcalloc(nr_cpu_ids, + sizeof(struct cpudl_item), + GFP_KERNEL); + if (!cp->elements) + return -ENOMEM; + + if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) { + kfree(cp->elements); return -ENOMEM; + } + + for_each_possible_cpu(i) + cp->elements[i].idx = IDX_INVALID; + cpumask_setall(cp->free_cpus); return 0; @@ -210,7 +224,6 @@ int cpudl_init(struct cpudl *cp) */ void cpudl_cleanup(struct cpudl *cp) { - /* - * nothing to do for the moment - */ + free_cpumask_var(cp->free_cpus); + kfree(cp->elements); } diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index a202789a412c..538c9796ad4a 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -5,17 +5,17 @@ #define IDX_INVALID -1 -struct array_item { +struct cpudl_item { u64 dl; int cpu; + int idx; }; struct cpudl { raw_spinlock_t lock; int size; - int cpu_to_idx[NR_CPUS]; - struct array_item elements[NR_CPUS]; cpumask_var_t free_cpus; + struct cpudl_item *elements; }; diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8b836b376d91..8834243abee2 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -30,6 +30,7 @@ #include <linux/gfp.h> #include <linux/sched.h> #include <linux/sched/rt.h> +#include <linux/slab.h> #include "cpupri.h" /* Convert between a 140 based task->prio, and our 102 based cpupri */ @@ -70,8 +71,7 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, int idx = 0; int task_pri = convert_prio(p->prio); - if (task_pri >= MAX_RT_PRIO) - return 0; + BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; @@ -219,8 +219,13 @@ int cpupri_init(struct cpupri *cp) goto cleanup; } + cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL); + if (!cp->cpu_to_pri) + goto cleanup; + for_each_possible_cpu(i) cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; cleanup: @@ -237,6 +242,7 @@ void cpupri_cleanup(struct cpupri *cp) { int i; + kfree(cp->cpu_to_pri); for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) free_cpumask_var(cp->pri_to_cpu[i].mask); } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index f6d756173491..6b033347fdfd 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -17,7 +17,7 @@ struct cpupri_vec { struct cpupri { struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; - int cpu_to_pri[NR_CPUS]; + int *cpu_to_pri; }; #ifdef CONFIG_SMP diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a95097cb4591..72fdf06ef865 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -332,50 +332,50 @@ out: * softirq as those do not count in task exec_runtime any more. */ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) + struct rq *rq, int ticks) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); + u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; if (steal_account_process_tick()) return; + cputime *= ticks; + scaled *= ticks; + if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_IRQ] += cputime; } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; + cpustat[CPUTIME_SOFTIRQ] += cputime; } else if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. */ - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SOFTIRQ); + __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); } else if (user_tick) { - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); } else if (p == rq->idle) { - account_idle_time(cputime_one_jiffy); + account_idle_time(cputime); } else if (p->flags & PF_VCPU) { /* System time or guest time */ - account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_guest_time(p, cputime, scaled); } else { - __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, - CPUTIME_SYSTEM); + __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); } } static void irqtime_account_idle_ticks(int ticks) { - int i; struct rq *rq = this_rq(); - for (i = 0; i < ticks; i++) - irqtime_account_process_tick(current, 0, rq); + irqtime_account_process_tick(current, 0, rq, ticks); } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ static inline void irqtime_account_idle_ticks(int ticks) {} static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, - struct rq *rq) {} + struct rq *rq, int nr_ticks) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ /* @@ -464,7 +464,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); + irqtime_account_process_tick(p, user_tick, rq, 1); return; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b08095786cb8..800e99b99075 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -528,6 +528,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); dl_se->dl_throttled = 0; + dl_se->dl_yielded = 0; if (p->on_rq) { enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (task_has_dl_policy(rq->curr)) @@ -893,10 +894,10 @@ static void yield_task_dl(struct rq *rq) * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it - * new scheduling parameters (thanks to dl_new=1). + * new scheduling parameters (thanks to dl_yielded=1). */ if (p->dl.runtime > 0) { - rq->curr->dl.dl_new = 1; + rq->curr->dl.dl_yielded = 1; p->dl.runtime = 0; } update_curr_dl(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7570dd969c28..0fdb96de81a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6653,6 +6653,7 @@ static int idle_balance(struct rq *this_rq) int this_cpu = this_rq->cpu; idle_enter_fair(this_rq); + /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -6705,14 +6706,16 @@ static int idle_balance(struct rq *this_rq) raw_spin_lock(&this_rq->lock); + if (curr_cost > this_rq->max_idle_balance_cost) + this_rq->max_idle_balance_cost = curr_cost; + /* - * While browsing the domains, we released the rq lock. - * A task could have be enqueued in the meantime + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. */ - if (this_rq->cfs.h_nr_running && !pulled_task) { + if (this_rq->cfs.h_nr_running && !pulled_task) pulled_task = 1; - goto out; - } if (pulled_task || time_after(jiffies, this_rq->next_balance)) { /* @@ -6722,9 +6725,6 @@ static int idle_balance(struct rq *this_rq) this_rq->next_balance = next_balance; } - if (curr_cost > this_rq->max_idle_balance_cost) - this_rq->max_idle_balance_cost = curr_cost; - out: /* Is there a task of a high priority class? */ if (this_rq->nr_running != this_rq->cfs.h_nr_running && |