diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/audit.c | 4 | ||||
-rw-r--r-- | kernel/events/core.c | 147 | ||||
-rw-r--r-- | kernel/hrtimer.c | 22 | ||||
-rw-r--r-- | kernel/irq/irqdesc.c | 7 | ||||
-rw-r--r-- | kernel/irq/manage.c | 17 | ||||
-rw-r--r-- | kernel/module.c | 6 | ||||
-rw-r--r-- | kernel/power/suspend.c | 3 | ||||
-rw-r--r-- | kernel/softirq.c | 5 | ||||
-rw-r--r-- | kernel/timer.c | 2 | ||||
-rw-r--r-- | kernel/trace/ftrace.c | 27 | ||||
-rw-r--r-- | kernel/trace/trace_events_trigger.c | 2 |
11 files changed, 135 insertions, 107 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index 7c2893602d06..47845c57eb19 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -643,13 +643,13 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) if ((task_active_pid_ns(current) != &init_pid_ns)) return -EPERM; - if (!capable(CAP_AUDIT_CONTROL)) + if (!netlink_capable(skb, CAP_AUDIT_CONTROL)) err = -EPERM; break; case AUDIT_USER: case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG: case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2: - if (!capable(CAP_AUDIT_WRITE)) + if (!netlink_capable(skb, CAP_AUDIT_WRITE)) err = -EPERM; break; default: /* bad msg */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 5129b1201050..ed50b0943213 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1444,6 +1444,11 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +struct remove_event { + struct perf_event *event; + bool detach_group; +}; + /* * Cross CPU call to remove a performance event * @@ -1452,12 +1457,15 @@ group_sched_out(struct perf_event *group_event, */ static int __perf_remove_from_context(void *info) { - struct perf_event *event = info; + struct remove_event *re = info; + struct perf_event *event = re->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); + if (re->detach_group) + perf_group_detach(event); list_del_event(event, ctx); if (!ctx->nr_events && cpuctx->task_ctx == ctx) { ctx->is_active = 0; @@ -1482,10 +1490,14 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event) +static void perf_remove_from_context(struct perf_event *event, bool detach_group) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; + struct remove_event re = { + .event = event, + .detach_group = detach_group, + }; lockdep_assert_held(&ctx->mutex); @@ -1494,12 +1506,12 @@ static void perf_remove_from_context(struct perf_event *event) * Per cpu events are removed via an smp call and * the removal is always successful. */ - cpu_function_call(event->cpu, __perf_remove_from_context, event); + cpu_function_call(event->cpu, __perf_remove_from_context, &re); return; } retry: - if (!task_function_call(task, __perf_remove_from_context, event)) + if (!task_function_call(task, __perf_remove_from_context, &re)) return; raw_spin_lock_irq(&ctx->lock); @@ -1516,6 +1528,8 @@ retry: * Since the task isn't running, its safe to remove the event, us * holding the ctx->lock ensures the task won't get scheduled in. */ + if (detach_group) + perf_group_detach(event); list_del_event(event, ctx); raw_spin_unlock_irq(&ctx->lock); } @@ -1664,6 +1678,8 @@ event_sched_in(struct perf_event *event, u64 tstamp = perf_event_time(event); int ret = 0; + lockdep_assert_held(&ctx->lock); + if (event->state <= PERF_EVENT_STATE_OFF) return 0; @@ -3235,7 +3251,8 @@ static void __free_event(struct perf_event *event) call_rcu(&event->rcu_head, free_event_rcu); } -static void free_event(struct perf_event *event) + +static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3263,45 +3280,31 @@ static void free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); } -int perf_event_release_kernel(struct perf_event *event) +/* + * Used to free events which have a known refcount of 1, such as in error paths + * where the event isn't exposed yet and inherited events. + */ +static void free_event(struct perf_event *event) { - struct perf_event_context *ctx = event->ctx; - - WARN_ON_ONCE(ctx->parent_ctx); - /* - * There are two ways this annotation is useful: - * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. - * - * 2) there is a lock-inversion with mmap_sem through - * perf_event_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. - */ - mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); - raw_spin_lock_irq(&ctx->lock); - perf_group_detach(event); - raw_spin_unlock_irq(&ctx->lock); - perf_remove_from_context(event); - mutex_unlock(&ctx->mutex); - - free_event(event); + if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1, + "unexpected event refcount: %ld; ptr=%p\n", + atomic_long_read(&event->refcount), event)) { + /* leak to avoid use-after-free */ + return; + } - return 0; + _free_event(event); } -EXPORT_SYMBOL_GPL(perf_event_release_kernel); /* * Called when the last reference to the file is gone. */ static void put_event(struct perf_event *event) { + struct perf_event_context *ctx = event->ctx; struct task_struct *owner; if (!atomic_long_dec_and_test(&event->refcount)) @@ -3340,8 +3343,32 @@ static void put_event(struct perf_event *event) put_task_struct(owner); } - perf_event_release_kernel(event); + WARN_ON_ONCE(ctx->parent_ctx); + /* + * There are two ways this annotation is useful: + * + * 1) there is a lock recursion from perf_event_exit_task + * see the comment there. + * + * 2) there is a lock-inversion with mmap_sem through + * perf_event_read_group(), which takes faults while + * holding ctx->mutex, however this is called after + * the last filedesc died, so there is no possibility + * to trigger the AB-BA case. + */ + mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); + perf_remove_from_context(event, true); + mutex_unlock(&ctx->mutex); + + _free_event(event); +} + +int perf_event_release_kernel(struct perf_event *event) +{ + put_event(event); + return 0; } +EXPORT_SYMBOL_GPL(perf_event_release_kernel); static int perf_release(struct inode *inode, struct file *file) { @@ -7070,20 +7097,26 @@ SYSCALL_DEFINE5(perf_event_open, } } + if (task && group_leader && + group_leader->attr.inherit != attr.inherit) { + err = -EINVAL; + goto err_task; + } + get_online_cpus(); event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL, NULL); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_task; + goto err_cpus; } if (flags & PERF_FLAG_PID_CGROUP) { err = perf_cgroup_connect(pid, event, &attr, group_leader); if (err) { __free_event(event); - goto err_task; + goto err_cpus; } } @@ -7180,7 +7213,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *gctx = group_leader->ctx; mutex_lock(&gctx->mutex); - perf_remove_from_context(group_leader); + perf_remove_from_context(group_leader, false); /* * Removing from the context ends up with disabled @@ -7190,7 +7223,7 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__state_init(group_leader); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling); + perf_remove_from_context(sibling, false); perf_event__state_init(sibling); put_ctx(gctx); } @@ -7245,8 +7278,9 @@ err_context: put_ctx(ctx); err_alloc: free_event(event); -err_task: +err_cpus: put_online_cpus(); +err_task: if (task) put_task_struct(task); err_group_fd: @@ -7320,7 +7354,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock(&src_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event); + perf_remove_from_context(event, false); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -7382,13 +7416,7 @@ __perf_event_exit_task(struct perf_event *child_event, struct perf_event_context *child_ctx, struct task_struct *child) { - if (child_event->parent) { - raw_spin_lock_irq(&child_ctx->lock); - perf_group_detach(child_event); - raw_spin_unlock_irq(&child_ctx->lock); - } - - perf_remove_from_context(child_event); + perf_remove_from_context(child_event, true); /* * It can happen that the parent exits first, and has events @@ -7403,7 +7431,7 @@ __perf_event_exit_task(struct perf_event *child_event, static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *tmp; + struct perf_event *child_event; struct perf_event_context *child_ctx; unsigned long flags; @@ -7457,24 +7485,9 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ mutex_lock(&child_ctx->mutex); -again: - list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, - group_entry) + list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry) __perf_event_exit_task(child_event, child_ctx, child); - list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, - group_entry) - __perf_event_exit_task(child_event, child_ctx, child); - - /* - * If the last event was a group event, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->pinned_groups) || - !list_empty(&child_ctx->flexible_groups)) - goto again; - mutex_unlock(&child_ctx->mutex); put_ctx(child_ctx); @@ -7739,6 +7752,8 @@ int perf_event_init_context(struct task_struct *child, int ctxn) * swapped under us. */ parent_ctx = perf_pin_task_context(parent, ctxn); + if (!parent_ctx) + return 0; /* * No need to check if parent_ctx != NULL here; since we saw @@ -7872,14 +7887,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu) static void __perf_event_exit_context(void *__info) { + struct remove_event re = { .detach_group = false }; struct perf_event_context *ctx = __info; - struct perf_event *event; perf_pmu_rotate_stop(ctx->pmu); rcu_read_lock(); - list_for_each_entry_rcu(event, &ctx->event_list, event_entry) - __perf_remove_from_context(event); + list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) + __perf_remove_from_context(&re); rcu_read_unlock(); } diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 53d26829cd4d..d10eba8089d1 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -234,6 +234,11 @@ again: goto again; } timer->base = new_base; + } else { + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + goto again; + } } return new_base; } @@ -569,6 +574,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) cpu_base->expires_next.tv64 = expires_next.tv64; + /* + * If a hang was detected in the last timer interrupt then we + * leave the hang delay active in the hardware. We want the + * system to make progress. That also prevents the following + * scenario: + * T1 expires 50ms from now + * T2 expires 5s from now + * + * T1 is removed, so this code is called and would reprogram + * the hardware to 5s from now. Any hrtimer_start after that + * will not reprogram the hardware due to hang_detected being + * set. So we'd effectivly block all timers until the T2 event + * fires. + */ + if (cpu_base->hang_detected) + return; + if (cpu_base->expires_next.tv64 != KTIME_MAX) tick_program_event(cpu_base->expires_next, 1); } diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index a7174617616b..bb07f2928f4b 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -363,6 +363,13 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, if (from > irq) return -EINVAL; from = irq; + } else { + /* + * For interrupts which are freely allocated the + * architecture can force a lower bound to the @from + * argument. x86 uses this to exclude the GSI space. + */ + from = arch_dynirq_lower_bound(from); } mutex_lock(&sparse_irq_lock); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 2486a4c1a710..d34131ca372b 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -180,7 +180,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, struct irq_chip *chip = irq_data_get_irq_chip(data); int ret; - ret = chip->irq_set_affinity(data, mask, false); + ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: cpumask_copy(data->affinity, mask); @@ -192,7 +192,8 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, return ret; } -int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) +int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, + bool force) { struct irq_chip *chip = irq_data_get_irq_chip(data); struct irq_desc *desc = irq_data_to_desc(data); @@ -202,7 +203,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return -EINVAL; if (irq_can_move_pcntxt(data)) { - ret = irq_do_set_affinity(data, mask, false); + ret = irq_do_set_affinity(data, mask, force); } else { irqd_set_move_pending(data); irq_copy_pending(desc, mask); @@ -217,13 +218,7 @@ int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) return ret; } -/** - * irq_set_affinity - Set the irq affinity of a given irq - * @irq: Interrupt to set affinity - * @mask: cpumask - * - */ -int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -233,7 +228,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask) return -EINVAL; raw_spin_lock_irqsave(&desc->lock, flags); - ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); + ret = irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask, force); raw_spin_unlock_irqrestore(&desc->lock, flags); return ret; } diff --git a/kernel/module.c b/kernel/module.c index 11869408f79b..079c4615607d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -815,9 +815,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, return -EFAULT; name[MODULE_NAME_LEN-1] = '\0'; - if (!(flags & O_NONBLOCK)) - pr_warn("waiting module removal not supported: please upgrade\n"); - if (mutex_lock_interruptible(&module_mutex) != 0) return -EINTR; @@ -3271,6 +3268,9 @@ static int load_module(struct load_info *info, const char __user *uargs, dynamic_debug_setup(info->debug, info->num_debug); + /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ + ftrace_module_init(mod); + /* Finally it's fully formed, ready to start executing. */ err = complete_formation(mod, info); if (err) diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c3ad9cafe930..8233cd4047d7 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/console.h> #include <linux/cpu.h> +#include <linux/cpuidle.h> #include <linux/syscalls.h> #include <linux/gfp.h> #include <linux/io.h> @@ -53,7 +54,9 @@ static void freeze_begin(void) static void freeze_enter(void) { + cpuidle_resume(); wait_event(suspend_freeze_wait_head, suspend_freeze_wake); + cpuidle_pause(); } void freeze_wake(void) diff --git a/kernel/softirq.c b/kernel/softirq.c index b50990a5bea0..33e4648ae0e7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -779,3 +779,8 @@ int __init __weak arch_early_irq_init(void) { return 0; } + +unsigned int __weak arch_dynirq_lower_bound(unsigned int from) +{ + return from; +} diff --git a/kernel/timer.c b/kernel/timer.c index 87bd529879c2..3bb01a323b2a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -838,7 +838,7 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) bit = find_last_bit(&mask, BITS_PER_LONG); - mask = (1 << bit) - 1; + mask = (1UL << bit) - 1; expires_limit = expires_limit & ~(mask); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1fd4b9479210..4a54a25afa2f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4330,16 +4330,11 @@ static void ftrace_init_module(struct module *mod, ftrace_process_locs(mod, start, end); } -static int ftrace_module_notify_enter(struct notifier_block *self, - unsigned long val, void *data) +void ftrace_module_init(struct module *mod) { - struct module *mod = data; - - if (val == MODULE_STATE_COMING) - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); - return 0; + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); } static int ftrace_module_notify_exit(struct notifier_block *self, @@ -4353,11 +4348,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self, return 0; } #else -static int ftrace_module_notify_enter(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} static int ftrace_module_notify_exit(struct notifier_block *self, unsigned long val, void *data) { @@ -4365,11 +4355,6 @@ static int ftrace_module_notify_exit(struct notifier_block *self, } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_enter_nb = { - .notifier_call = ftrace_module_notify_enter, - .priority = INT_MAX, /* Run before anything that can use kprobes */ -}; - struct notifier_block ftrace_module_exit_nb = { .notifier_call = ftrace_module_notify_exit, .priority = INT_MIN, /* Run after anything that can remove kprobes */ @@ -4403,10 +4388,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_enter_nb); - if (ret) - pr_warning("Failed to register trace ftrace module enter notifier\n"); - ret = register_module_notifier(&ftrace_module_exit_nb); if (ret) pr_warning("Failed to register trace ftrace module exit notifier\n"); diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 925f537f07d1..4747b476a030 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -77,7 +77,7 @@ event_triggers_call(struct ftrace_event_file *file, void *rec) data->ops->func(data); continue; } - filter = rcu_dereference(data->filter); + filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; if (data->cmd_ops->post_trigger) { |