diff options
Diffstat (limited to 'kernel')
40 files changed, 1304 insertions, 783 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 09c65640cad6..e85bdfd15fed 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1021,8 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. */ - if (unlikely((len == 0) || len > MAX_ARG_STRLEN - 1)) { - WARN_ON(1); + if (WARN_ON_ONCE(len < 0 || len > MAX_ARG_STRLEN - 1)) { send_sig(SIGKILL, current, 0); return -1; } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f89d9292eee6..b89f3168411b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock); struct percpu_rw_semaphore cgroup_threadgroup_rwsem; #define cgroup_assert_mutex_or_rcu_locked() \ - rcu_lockdep_assert(rcu_read_lock_held() || \ - lockdep_is_held(&cgroup_mutex), \ + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + !lockdep_is_held(&cgroup_mutex), \ "cgroup_mutex or RCU read lock required"); /* diff --git a/kernel/cpu.c b/kernel/cpu.c index 664ce5299334..82cf9dff4295 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -21,6 +21,7 @@ #include <linux/suspend.h> #include <linux/lockdep.h> #include <linux/tick.h> +#include <linux/irq.h> #include <trace/events/power.h> #include "smpboot.h" @@ -190,21 +191,22 @@ void cpu_hotplug_done(void) void cpu_hotplug_disable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 1; + cpu_hotplug_disabled++; cpu_maps_update_done(); } +EXPORT_SYMBOL_GPL(cpu_hotplug_disable); void cpu_hotplug_enable(void) { cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); cpu_maps_update_done(); } - +EXPORT_SYMBOL_GPL(cpu_hotplug_enable); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ -int __ref register_cpu_notifier(struct notifier_block *nb) +int register_cpu_notifier(struct notifier_block *nb) { int ret; cpu_maps_update_begin(); @@ -213,7 +215,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } -int __ref __register_cpu_notifier(struct notifier_block *nb) +int __register_cpu_notifier(struct notifier_block *nb) { return raw_notifier_chain_register(&cpu_chain, nb); } @@ -243,7 +245,7 @@ static void cpu_notify_nofail(unsigned long val, void *v) EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); -void __ref unregister_cpu_notifier(struct notifier_block *nb) +void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); raw_notifier_chain_unregister(&cpu_chain, nb); @@ -251,7 +253,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_cpu_notifier); -void __ref __unregister_cpu_notifier(struct notifier_block *nb) +void __unregister_cpu_notifier(struct notifier_block *nb) { raw_notifier_chain_unregister(&cpu_chain, nb); } @@ -328,7 +330,7 @@ struct take_cpu_down_param { }; /* Take this CPU down. */ -static int __ref take_cpu_down(void *_param) +static int take_cpu_down(void *_param) { struct take_cpu_down_param *param = _param; int err; @@ -347,7 +349,7 @@ static int __ref take_cpu_down(void *_param) } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int _cpu_down(unsigned int cpu, int tasks_frozen) { int err, nr_calls = 0; void *hcpu = (void *)(long)cpu; @@ -380,25 +382,31 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * will observe it. * * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so explicitly call both. + * not imply sync_sched(), so wait for both. * * Do sync before park smpboot threads to take care the rcu boost case. */ -#ifdef CONFIG_PREEMPT - synchronize_sched(); -#endif - synchronize_rcu(); + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_mult(call_rcu, call_rcu_sched); + else + synchronize_rcu(); smpboot_park_threads(cpu); /* - * So now all preempt/rcu users must observe !cpu_active(). + * Prevent irq alloc/free while the dying cpu reorganizes the + * interrupt affinities. */ + irq_lock_sparse(); + /* + * So now all preempt/rcu users must observe !cpu_active(). + */ err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + irq_unlock_sparse(); goto out_release; } BUG_ON(cpu_online(cpu)); @@ -415,6 +423,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ per_cpu(cpu_dead_idle, cpu) = false; + /* Interrupts are moved away from the dying cpu, reenable alloc/free */ + irq_unlock_sparse(); + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); @@ -432,7 +443,7 @@ out_release: return err; } -int __ref cpu_down(unsigned int cpu) +int cpu_down(unsigned int cpu) { int err; @@ -519,6 +530,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) /* Arch-specific enabling code. */ ret = __cpu_up(cpu, idle); + if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -597,13 +609,18 @@ int disable_nonboot_cpus(void) } } - if (!error) { + if (!error) BUG_ON(num_online_cpus() > 1); - /* Make sure the CPUs won't be enabled by someone else */ - cpu_hotplug_disabled = 1; - } else { + else pr_err("Non-boot CPUs are not disabled\n"); - } + + /* + * Make sure the CPUs won't be enabled by someone else. We need to do + * this even in case of failure as all disable_nonboot_cpus() users are + * supposed to do enable_nonboot_cpus() on the failure path. + */ + cpu_hotplug_disabled++; + cpu_maps_update_done(); return error; } @@ -616,13 +633,13 @@ void __weak arch_enable_nonboot_cpus_end(void) { } -void __ref enable_nonboot_cpus(void) +void enable_nonboot_cpus(void) { int cpu, error; /* Allow everyone to use the CPU hotplug again */ cpu_maps_update_begin(); - cpu_hotplug_disabled = 0; + WARN_ON(--cpu_hotplug_disabled < 0); if (cpumask_empty(frozen_cpus)) goto out; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ee14e3a35a29..f0acff0f66c9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ - update_nodemasks_hier(cs, &cs->mems_allowed); + update_nodemasks_hier(cs, &trialcs->mems_allowed); done: return retval; } diff --git a/kernel/events/core.c b/kernel/events/core.c index e965cfae4207..ae16867670a9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; +static atomic_t nr_switch_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -1868,8 +1869,6 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); - event->tstamp_running += tstamp - event->tstamp_stopped; - perf_set_shadow_time(event, ctx, tstamp); perf_log_itrace_start(event); @@ -1881,6 +1880,8 @@ event_sched_in(struct perf_event *event, goto out; } + event->tstamp_running += tstamp - event->tstamp_stopped; + if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) @@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev, local_irq_restore(flags); } +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in); + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task, if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(task, next, false); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); + if (atomic_read(&nr_switch_events)) + perf_event_switch(task, prev, true); + if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3454,6 +3464,10 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_task_events); if (event->attr.freq) atomic_dec(&nr_freq_events); + if (event->attr.context_switch) { + static_key_slow_dec_deferred(&perf_sched_events); + atomic_dec(&nr_switch_events); + } if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -3958,28 +3972,21 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) -{ - struct perf_event_context *ctx = event->ctx; - int ret = 0, active; +struct period_event { + struct perf_event *event; u64 value; +}; - if (!is_sampling_event(event)) - return -EINVAL; - - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - - if (!value) - return -EINVAL; +static int __perf_event_period(void *info) +{ + struct period_event *pe = info; + struct perf_event *event = pe->event; + struct perf_event_context *ctx = event->ctx; + u64 value = pe->value; + bool active; - raw_spin_lock_irq(&ctx->lock); + raw_spin_lock(&ctx->lock); if (event->attr.freq) { - if (value > sysctl_perf_event_sample_rate) { - ret = -EINVAL; - goto unlock; - } - event->attr.sample_freq = value; } else { event->attr.sample_period = value; @@ -3998,11 +4005,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } + raw_spin_unlock(&ctx->lock); -unlock: + return 0; +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct period_event pe = { .event = event, }; + struct perf_event_context *ctx = event->ctx; + struct task_struct *task; + u64 value; + + if (!is_sampling_event(event)) + return -EINVAL; + + if (copy_from_user(&value, arg, sizeof(value))) + return -EFAULT; + + if (!value) + return -EINVAL; + + if (event->attr.freq && value > sysctl_perf_event_sample_rate) + return -EINVAL; + + task = ctx->task; + pe.value = value; + + if (!task) { + cpu_function_call(event->cpu, __perf_event_period, &pe); + return 0; + } + +retry: + if (!task_function_call(task, __perf_event_period, &pe)) + return 0; + + raw_spin_lock_irq(&ctx->lock); + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + task = ctx->task; + goto retry; + } + + __perf_event_period(&pe); raw_spin_unlock_irq(&ctx->lock); - return ret; + return 0; } static const struct file_operations perf_fops; @@ -4358,14 +4407,6 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_unlock(); } -static void rb_free_rcu(struct rcu_head *rcu_head) -{ - struct ring_buffer *rb; - - rb = container_of(rcu_head, struct ring_buffer, rcu_head); - rb_free(rb); -} - struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -4748,12 +4789,20 @@ static const struct file_operations perf_fops = { * to user-space before waking everybody up. */ +static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) +{ + /* only the parent has fasync state */ + if (event->parent) + event = event->parent; + return &event->fasync; +} + void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { - kill_fasync(&event->fasync, SIGIO, event->pending_kill); + kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } @@ -5990,6 +6039,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) } /* + * context_switch tracking + */ + +struct perf_switch_event { + struct task_struct *task; + struct task_struct *next_prev; + + struct { + struct perf_event_header header; + u32 next_prev_pid; + u32 next_prev_tid; + } event_id; +}; + +static int perf_event_switch_match(struct perf_event *event) +{ + return event->attr.context_switch; +} + +static void perf_event_switch_output(struct perf_event *event, void *data) +{ + struct perf_switch_event *se = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_switch_match(event)) + return; + + /* Only CPU-wide events are allowed to see next/prev pid/tid */ + if (event->ctx->task) { + se->event_id.header.type = PERF_RECORD_SWITCH; + se->event_id.header.size = sizeof(se->event_id.header); + } else { + se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE; + se->event_id.header.size = sizeof(se->event_id); + se->event_id.next_prev_pid = + perf_event_pid(event, se->next_prev); + se->event_id.next_prev_tid = + perf_event_tid(event, se->next_prev); + } + + perf_event_header__init_id(&se->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, se->event_id.header.size); + if (ret) + return; + + if (event->ctx->task) + perf_output_put(&handle, se->event_id.header); + else + perf_output_put(&handle, se->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_switch(struct task_struct *task, + struct task_struct *next_prev, bool sched_in) +{ + struct perf_switch_event switch_event; + + /* N.B. caller checks nr_switch_events != 0 */ + + switch_event = (struct perf_switch_event){ + .task = task, + .next_prev = next_prev, + .event_id = { + .header = { + /* .type */ + .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT, + /* .size */ + }, + /* .next_prev_pid */ + /* .next_prev_tid */ + }, + }; + + perf_event_aux(perf_event_switch_output, + &switch_event, + NULL); +} + +/* * IRQ throttle logging */ @@ -6048,8 +6182,6 @@ static void perf_log_itrace_start(struct perf_event *event) event->hw.itrace_started) return; - event->hw.itrace_started = 1; - rec.header.type = PERF_RECORD_ITRACE_START; rec.header.misc = 0; rec.header.size = sizeof(rec); @@ -6132,7 +6264,7 @@ static int __perf_event_overflow(struct perf_event *event, else perf_event_output(event, data, regs); - if (event->fasync && event->pending_kill) { + if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending); } @@ -6757,8 +6889,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) if (event->tp_event->prog) return -EEXIST; - if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) - /* bpf programs can only be attached to kprobes */ + if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE)) + /* bpf programs can only be attached to u/kprobes */ return -EINVAL; prog = bpf_prog_get(prog_fd); @@ -7487,6 +7619,10 @@ static void account_event(struct perf_event *event) if (atomic_inc_return(&nr_freq_events) == 1) tick_nohz_full_kick_all(); } + if (event->attr.context_switch) { + atomic_inc(&nr_switch_events); + static_key_slow_inc(&perf_sched_events.key); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2deb24c7a40d..2bbad9c1274c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -11,6 +11,7 @@ struct ring_buffer { atomic_t refcount; struct rcu_head rcu_head; + struct irq_work irq_work; #ifdef CONFIG_PERF_USE_VMALLOC struct work_struct work; int page_order; /* allocation order */ @@ -55,6 +56,15 @@ struct ring_buffer { }; extern void rb_free(struct ring_buffer *rb); + +static inline void rb_free_rcu(struct rcu_head *rcu_head) +{ + struct ring_buffer *rb; + + rb = container_of(rcu_head, struct ring_buffer, rcu_head); + rb_free(rb); +} + extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 96472824a752..182bc30899d5 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -221,6 +221,8 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } +static void rb_irq_work(struct irq_work *work); + static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { @@ -241,6 +243,16 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); + init_irq_work(&rb->irq_work, rb_irq_work); +} + +static void ring_buffer_put_async(struct ring_buffer *rb) +{ + if (!atomic_dec_and_test(&rb->refcount)) + return; + + rb->rcu_head.next = (void *)rb; + irq_work_queue(&rb->irq_work); } /* @@ -319,7 +331,7 @@ err_put: rb_free_aux(rb); err: - ring_buffer_put(rb); + ring_buffer_put_async(rb); handle->event = NULL; return NULL; @@ -370,7 +382,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, local_set(&rb->aux_nest, 0); rb_free_aux(rb); - ring_buffer_put(rb); + ring_buffer_put_async(rb); } /* @@ -425,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order) if (page && order) { /* - * Communicate the allocation size to the driver + * Communicate the allocation size to the driver: + * if we managed to secure a high-order allocation, + * set its first page's private to this order; + * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); @@ -547,17 +562,30 @@ static void __rb_free_aux(struct ring_buffer *rb) rb->aux_priv = NULL; } - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } } void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) + irq_work_queue(&rb->irq_work); +} + +static void rb_irq_work(struct irq_work *work) +{ + struct ring_buffer *rb = container_of(work, struct ring_buffer, irq_work); + + if (!atomic_read(&rb->aux_refcount)) __rb_free_aux(rb); + + if (rb->rcu_head.next == (void *)rb) + call_rcu(&rb->rcu_head, rb_free_rcu); } #ifndef CONFIG_PERF_USE_VMALLOC diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index cb346f26a22d..4e5e9798aa0c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -86,15 +86,6 @@ struct uprobe { struct arch_uprobe arch; }; -struct return_instance { - struct uprobe *uprobe; - unsigned long func; - unsigned long orig_ret_vaddr; /* original return address */ - bool chained; /* true, if instance is nested */ - - struct return_instance *next; /* keep as stack */ -}; - /* * Execute out of line area: anonymous executable mapping installed * by the probed task to execute the copy of the original instruction @@ -105,17 +96,18 @@ struct return_instance { * allocated. */ struct xol_area { - wait_queue_head_t wq; /* if all slots are busy */ - atomic_t slot_count; /* number of in-use slots */ - unsigned long *bitmap; /* 0 = free slot */ - struct page *page; + wait_queue_head_t wq; /* if all slots are busy */ + atomic_t slot_count; /* number of in-use slots */ + unsigned long *bitmap; /* 0 = free slot */ + struct vm_special_mapping xol_mapping; + struct page *pages[2]; /* * We keep the vma's vm_start rather than a pointer to the vma * itself. The probed process or a naughty kernel module could make * the vma go away, and we must handle that reasonably gracefully. */ - unsigned long vaddr; /* Page(s) of instruction slots */ + unsigned long vaddr; /* Page(s) of instruction slots */ }; /* @@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); } +static struct uprobe *get_uprobe(struct uprobe *uprobe) +{ + atomic_inc(&uprobe->ref); + return uprobe; +} + +static void put_uprobe(struct uprobe *uprobe) +{ + if (atomic_dec_and_test(&uprobe->ref)) + kfree(uprobe); +} + static int match_uprobe(struct uprobe *l, struct uprobe *r) { if (l->inode < r->inode) @@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) while (n) { uprobe = rb_entry(n, struct uprobe, rb_node); match = match_uprobe(&u, uprobe); - if (!match) { - atomic_inc(&uprobe->ref); - return uprobe; - } + if (!match) + return get_uprobe(uprobe); if (match < 0) n = n->rb_left; @@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) parent = *p; u = rb_entry(parent, struct uprobe, rb_node); match = match_uprobe(uprobe, u); - if (!match) { - atomic_inc(&u->ref); - return u; - } + if (!match) + return get_uprobe(u); if (match < 0) p = &parent->rb_left; @@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe) return u; } -static void put_uprobe(struct uprobe *uprobe) -{ - if (atomic_dec_and_test(&uprobe->ref)) - kfree(uprobe); -} - static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) { struct uprobe *uprobe, *cur_uprobe; @@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode, if (u->inode != inode || u->offset < min) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } for (t = n; (t = rb_next(t)); ) { u = rb_entry(t, struct uprobe, rb_node); if (u->inode != inode || u->offset > max) break; list_add(&u->pending_list, head); - atomic_inc(&u->ref); + get_uprobe(u); } } spin_unlock(&uprobes_treelock); @@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon /* Slot allocation for XOL */ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) { - int ret = -EALREADY; + struct vm_area_struct *vma; + int ret; down_write(&mm->mmap_sem); - if (mm->uprobes_state.xol_area) + if (mm->uprobes_state.xol_area) { + ret = -EALREADY; goto fail; + } if (!area->vaddr) { /* Try to map as high as possible, this is only a hint. */ @@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } } - ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, - VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); - if (ret) + vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE, + VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, + &area->xol_mapping); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto fail; + } + ret = 0; smp_wmb(); /* pairs with get_xol_area() */ mm->uprobes_state.xol_area = area; fail: @@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) if (!area->bitmap) goto free_area; - area->page = alloc_page(GFP_HIGHUSER); - if (!area->page) + area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.pages = area->pages; + area->pages[0] = alloc_page(GFP_HIGHUSER); + if (!area->pages[0]) goto free_bitmap; + area->pages[1] = NULL; area->vaddr = vaddr; init_waitqueue_head(&area->wq); /* Reserve the 1st slot for get_trampoline_vaddr() */ set_bit(0, area->bitmap); atomic_set(&area->slot_count, 1); - copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); + copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE); if (!xol_add_vma(mm, area)) return area; - __free_page(area->page); + __free_page(area->pages[0]); free_bitmap: kfree(area->bitmap); free_area: @@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm) if (!area) return; - put_page(area->page); + put_page(area->pages[0]); kfree(area->bitmap); kfree(area); } @@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe) if (unlikely(!xol_vaddr)) return 0; - arch_uprobe_copy_ixol(area->page, xol_vaddr, + arch_uprobe_copy_ixol(area->pages[0], xol_vaddr, &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); return xol_vaddr; @@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk) clear_bit(slot_nr, area->bitmap); atomic_dec(&area->slot_count); + smp_mb__after_atomic(); /* pairs with prepare_to_wait() */ if (waitqueue_active(&area->wq)) wake_up(&area->wq); @@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) return instruction_pointer(regs); } +static struct return_instance *free_ret_instance(struct return_instance *ri) +{ + struct return_instance *next = ri->next; + put_uprobe(ri->uprobe); + kfree(ri); + return next; +} + /* * Called with no locks held. * Called in context of a exiting or a exec-ing thread. @@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs) void uprobe_free_utask(struct task_struct *t) { struct uprobe_task *utask = t->utask; - struct return_instance *ri, *tmp; + struct return_instance *ri; if (!utask) return; @@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t) put_uprobe(utask->active_uprobe); ri = utask->return_instances; - while (ri) { - tmp = ri; - ri = ri->next; - - put_uprobe(tmp->uprobe); - kfree(tmp); - } + while (ri) + ri = free_ret_instance(ri); xol_free_insn_slot(t); kfree(utask); @@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask) return -ENOMEM; *n = *o; - atomic_inc(&n->uprobe->ref); + get_uprobe(n->uprobe); n->next = NULL; *p = n; @@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void) return trampoline_vaddr; } +static void cleanup_return_instances(struct uprobe_task *utask, bool chained, + struct pt_regs *regs) +{ + struct return_instance *ri = utask->return_instances; + enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL; + + while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) { + ri = free_ret_instance(ri); + utask->depth--; + } + utask->return_instances = ri; +} + static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) { struct return_instance *ri; struct uprobe_task *utask; unsigned long orig_ret_vaddr, trampoline_vaddr; - bool chained = false; + bool chained; if (!get_xol_area()) return; @@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) return; } - ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); + ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL); if (!ri) - goto fail; + return; trampoline_vaddr = get_trampoline_vaddr(); orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); if (orig_ret_vaddr == -1) goto fail; + /* drop the entries invalidated by longjmp() */ + chained = (orig_ret_vaddr == trampoline_vaddr); + cleanup_return_instances(utask, chained, regs); + /* * We don't want to keep trampoline address in stack, rather keep the * original return address of first caller thru all the consequent * instances. This also makes breakpoint unwrapping easier. */ - if (orig_ret_vaddr == trampoline_vaddr) { + if (chained) { if (!utask->return_instances) { /* * This situation is not possible. Likely we have an * attack from user-space. */ - pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); + uprobe_warn(current, "handle tail call"); goto fail; } - - chained = true; orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; } - atomic_inc(&uprobe->ref); - ri->uprobe = uprobe; + ri->uprobe = get_uprobe(uprobe); ri->func = instruction_pointer(regs); + ri->stack = user_stack_pointer(regs); ri->orig_ret_vaddr = orig_ret_vaddr; ri->chained = chained; utask->depth++; - - /* add instance to the stack */ ri->next = utask->return_instances; utask->return_instances = ri; return; - fail: kfree(ri); } @@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs) up_read(&uprobe->register_rwsem); } -static bool handle_trampoline(struct pt_regs *regs) +static struct return_instance *find_next_ret_chain(struct return_instance *ri) { - struct uprobe_task *utask; - struct return_instance *ri, *tmp; bool chained; + do { + chained = ri->chained; + ri = ri->next; /* can't be NULL if chained */ + } while (chained); + + return ri; +} + +static void handle_trampoline(struct pt_regs *regs) +{ + struct uprobe_task *utask; + struct return_instance *ri, *next; + bool valid; + utask = current->utask; if (!utask) - return false; + goto sigill; ri = utask->return_instances; if (!ri) - return false; - - /* - * TODO: we should throw out return_instance's invalidated by - * longjmp(), currently we assume that the probed function always - * returns. - */ - instruction_pointer_set(regs, ri->orig_ret_vaddr); - - for (;;) { - handle_uretprobe_chain(ri, regs); - - chained = ri->chained; - put_uprobe(ri->uprobe); - - tmp = ri; - ri = ri->next; - kfree(tmp); - utask->depth--; + goto sigill; - if (!chained) - break; - BUG_ON(!ri); - } + do { + /* + * We should throw out the frames invalidated by longjmp(). + * If this chain is valid, then the next one should be alive + * or NULL; the latter case means that nobody but ri->func + * could hit this trampoline on return. TODO: sigaltstack(). + */ + next = find_next_ret_chain(ri); + valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs); + + instruction_pointer_set(regs, ri->orig_ret_vaddr); + do { + if (valid) + handle_uretprobe_chain(ri, regs); + ri = free_ret_instance(ri); + utask->depth--; + } while (ri != next); + } while (!valid); utask->return_instances = ri; + return; + + sigill: + uprobe_warn(current, "handle uretprobe, sending SIGILL."); + force_sig_info(SIGILL, SEND_SIG_FORCED, current); - return true; } bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) @@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) return false; } +bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + return true; +} + /* * Run handler and ask thread to singlestep. * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. @@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs) int uninitialized_var(is_swbp); bp_vaddr = uprobe_get_swbp_addr(regs); - if (bp_vaddr == get_trampoline_vaddr()) { - if (handle_trampoline(regs)) - return; - - pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n", - current->pid, current->tgid); - } + if (bp_vaddr == get_trampoline_vaddr()) + return handle_trampoline(regs); uprobe = find_active_uprobe(bp_vaddr, &is_swbp); if (!uprobe) { diff --git a/kernel/fork.c b/kernel/fork.c index 6e8f807c5716..0d93b4d0617b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested) max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +/* Initialized by the architecture: */ +int arch_task_struct_size __read_mostly; +#endif + void __init fork_init(void) { #ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR @@ -295,7 +300,7 @@ void __init fork_init(void) #endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = - kmem_cache_create("task_struct", sizeof(struct task_struct), + kmem_cache_create("task_struct", arch_task_struct_size, ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); #endif diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 27f4332c7f84..ae216824e8ca 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -985,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data, } /** + * irq_chip_set_type_parent - Set IRQ type on the parent interrupt + * @data: Pointer to interrupt specific data + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * + * Conditional, as the underlying parent chip might not implement it. + */ +int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) +{ + data = data->parent_data; + + if (data->chip->irq_set_type) + return data->chip->irq_set_type(data, type); + + return -ENOSYS; +} + +/** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware * @data: Pointer to interrupt specific data * @@ -997,7 +1014,7 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) if (data->chip && data->chip->irq_retrigger) return data->chip->irq_retrigger(data); - return -ENOSYS; + return 0; } /** diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 4834ee828c41..61008b8433ab 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,12 +76,8 @@ extern void unmask_threaded_irq(struct irq_desc *desc); #ifdef CONFIG_SPARSE_IRQ static inline void irq_mark_irq(unsigned int irq) { } -extern void irq_lock_sparse(void); -extern void irq_unlock_sparse(void); #else extern void irq_mark_irq(unsigned int irq); -static inline void irq_lock_sparse(void) { } -static inline void irq_unlock_sparse(void) { } #endif extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 9065107f083e..7a5237a1bce5 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -75,13 +75,21 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* - * If the interrupt has a parent irq and runs - * in the thread context of the parent irq, - * retrigger the parent. + * If the interrupt is running in the thread + * context of the parent irq we need to be + * careful, because we cannot trigger it + * directly. */ - if (desc->parent_irq && - irq_settings_is_nested_thread(desc)) + if (irq_settings_is_nested_thread(desc)) { + /* + * If the parent_irq is valid, we + * retrigger the parent, otherwise we + * do nothing. + */ + if (!desc->parent_irq) + return; irq = desc->parent_irq; + } /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); tasklet_schedule(&resend_tasklet); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c90e417bb963..d10ab6b9b5e0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr) addr < (unsigned long)__kprobes_text_end; } -static bool within_kprobe_blacklist(unsigned long addr) +bool within_kprobe_blacklist(unsigned long addr) { struct kprobe_blacklist_entry *ent; diff --git a/kernel/kthread.c b/kernel/kthread.c index 7c40a189becc..490924cc9e7c 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -97,6 +97,7 @@ bool kthread_should_park(void) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); } +EXPORT_SYMBOL_GPL(kthread_should_park); /** * kthread_freezable_should_stop - should this freezable kthread return now? @@ -171,6 +172,7 @@ void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } +EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { @@ -425,6 +427,7 @@ void kthread_unpark(struct task_struct *k) if (kthread) __kthread_unpark(k, kthread); } +EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). @@ -455,6 +458,7 @@ int kthread_park(struct task_struct *k) } return ret; } +EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 04ab18151cc8..df19ae4debd0 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -4,6 +4,7 @@ #include <linux/hash.h> #include <linux/bootmem.h> +#include <linux/debug_locks.h> /* * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead @@ -286,15 +287,23 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) { struct __qspinlock *l = (void *)lock; struct pv_node *node; + u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); /* * We must not unlock if SLOW, because in that case we must first * unhash. Otherwise it would be possible to have multiple @lock * entries, which would be BAD. */ - if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL)) + if (likely(lockval == _Q_LOCKED_VAL)) return; + if (unlikely(lockval != _Q_SLOW_VAL)) { + if (debug_locks_silent) + return; + WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val)); + return; + } + /* * Since the above failed to release, this must be the SLOW path. * Therefore start by looking up the blocked node and unhashing it. diff --git a/kernel/module.c b/kernel/module.c index 3e0e19763d24..b86b7bf1be38 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -602,13 +602,16 @@ const struct kernel_symbol *find_symbol(const char *name, } EXPORT_SYMBOL_GPL(find_symbol); -/* Search for module by name: must hold module_mutex. */ +/* + * Search for module by name: must hold module_mutex (or preempt disabled + * for read-only access). + */ static struct module *find_module_all(const char *name, size_t len, bool even_unformed) { struct module *mod; - module_assert_mutex(); + module_assert_mutex_or_preempt(); list_for_each_entry(mod, &modules, list) { if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) @@ -621,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len, struct module *find_module(const char *name) { + module_assert_mutex(); return find_module_all(name, strlen(name), false); } EXPORT_SYMBOL_GPL(find_module); @@ -3557,6 +3561,7 @@ static int load_module(struct load_info *info, const char __user *uargs, mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + mod_tree_remove(mod); wake_up_all(&module_wq); /* Wait for RCU-sched synchronizing before releasing mod->list. */ synchronize_sched(); diff --git a/kernel/pid.c b/kernel/pid.c index 4fd07d5b7baf..ca368793808e 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task); */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { - rcu_lockdep_assert(rcu_read_lock_held(), - "find_task_by_pid_ns() needs rcu_read_lock()" - " protection"); + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), + "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 59e32684c23b..77192953dee5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, .exp_sync = synchronize_sched_expedited, + .get_state = get_state_synchronize_sched, + .cond_sync = cond_synchronize_sched, .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, @@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = { #define RCUTORTURE_TASKS_OPS &tasks_ops, +static bool __maybe_unused torturing_tasks(void) +{ + return cur_ops == &tasks_ops; +} + #else /* #ifdef CONFIG_TASKS_RCU */ #define RCUTORTURE_TASKS_OPS +static bool torturing_tasks(void) +{ + return false; +} + #endif /* #else #ifdef CONFIG_TASKS_RCU */ /* @@ -823,9 +835,7 @@ rcu_torture_cbflood(void *arg) } if (err) { VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); - while (!torture_must_stop()) - schedule_timeout_interruptible(HZ); - return 0; + goto wait_for_stop; } VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); do { @@ -844,6 +854,7 @@ rcu_torture_cbflood(void *arg) stutter_wait("rcu_torture_cbflood"); } while (!torture_must_stop()); vfree(rhp); +wait_for_stop: torture_kthread_stopping("rcu_torture_cbflood"); return 0; } @@ -1088,7 +1099,8 @@ static void rcu_torture_timer(unsigned long unused) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -1162,7 +1174,8 @@ rcu_torture_reader(void *arg) p = rcu_dereference_check(rcu_torture_current, rcu_read_lock_bh_held() || rcu_read_lock_sched_held() || - srcu_read_lock_held(srcu_ctlp)); + srcu_read_lock_held(srcu_ctlp) || + torturing_tasks()); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1507,7 +1520,7 @@ static int rcu_torture_barrier_init(void) int i; int ret; - if (n_barrier_cbs == 0) + if (n_barrier_cbs <= 0) return 0; if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { pr_alert("%s" TORTURE_FLAG @@ -1786,12 +1799,15 @@ rcu_torture_init(void) writer_task); if (firsterr) goto unwind; - fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), - GFP_KERNEL); - if (fakewriter_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nfakewriters > 0) { + fakewriter_tasks = kzalloc(nfakewriters * + sizeof(fakewriter_tasks[0]), + GFP_KERNEL); + if (fakewriter_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } for (i = 0; i < nfakewriters; i++) { firsterr = torture_create_kthread(rcu_torture_fakewriter, @@ -1818,7 +1834,7 @@ rcu_torture_init(void) if (firsterr) goto unwind; } - if (test_no_idle_hz) { + if (test_no_idle_hz && shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval * HZ); if (firsterr) goto unwind; diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index fb33d35ee0b7..d3fcb2ec8536 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) } /** - * srcu_readers_active - returns approximate number of readers. + * srcu_readers_active - returns true if there are readers. and false + * otherwise * @sp: which srcu_struct to count active readers (holding srcu_read_lock). * * Note that this is not an atomic primitive, and can therefore suffer * severe errors when invoked on an active srcu_struct. That said, it * can be useful as an error check at cleanup time. */ -static int srcu_readers_active(struct srcu_struct *sp) +static bool srcu_readers_active(struct srcu_struct *sp) { int cpu; unsigned long sum = 0; @@ -414,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) struct rcu_head *head = &rcu.head; bool done = false; - rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && - !lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) || + lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section"); might_sleep(); init_completion(&rcu.completion); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c291bd65d2cb..d0471056d0af 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU read-side critical section"); cond_resched(); } EXPORT_SYMBOL_GPL(synchronize_sched); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 65137bc28b2b..9f75f25cc5d9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree"); static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; +static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS]; /* * In order to export the rcu_state name to the tracing tools, it @@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444); static int rcu_fanout_leaf = RCU_FANOUT_LEAF; module_param(rcu_fanout_leaf, int, 0444); int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; -static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ - NUM_RCU_LVL_0, - NUM_RCU_LVL_1, - NUM_RCU_LVL_2, - NUM_RCU_LVL_3, - NUM_RCU_LVL_4, -}; +/* Number of rcu_nodes at specified level. */ +static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ /* @@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user) * It is illegal to enter an extended quiescent state while * in an RCU read-side critical section. */ - rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), + "Illegal idle entry in RCU read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), + "Illegal idle entry in RCU-bh read-side critical section."); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), + "Illegal idle entry in RCU-sched read-side critical section."); } /* @@ -701,7 +698,7 @@ void rcu_idle_enter(void) } EXPORT_SYMBOL_GPL(rcu_idle_enter); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -714,7 +711,7 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle @@ -828,7 +825,7 @@ void rcu_idle_exit(void) } EXPORT_SYMBOL_GPL(rcu_idle_exit); -#ifdef CONFIG_RCU_USER_QS +#ifdef CONFIG_NO_HZ_FULL /** * rcu_user_exit - inform RCU that we are exiting userspace. * @@ -839,7 +836,7 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } -#endif /* CONFIG_RCU_USER_QS */ +#endif /* CONFIG_NO_HZ_FULL */ /** * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle @@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void) { bool ret; - preempt_disable(); + preempt_disable_notrace(); ret = __rcu_is_watching(); - preempt_enable(); + preempt_enable_notrace(); return ret; } EXPORT_SYMBOL_GPL(rcu_is_watching); @@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", rsp->name, j - gpa, - rsp->gpnum, rsp->completed, rsp->gp_flags); + rsp->gpnum, rsp->completed, + rsp->gp_flags, rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : 0); } /* @@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp) } /* + * Helper function for wait_event_interruptible_timeout() wakeup + * at force-quiescent-state time. + */ +static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Someone like call_rcu() requested a force-quiescent-state scan. */ + *gfp = READ_ONCE(rsp->gp_flags); + if (*gfp & RCU_GP_FLAG_FQS) + return true; + + /* The current grace period has completed. */ + if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) + return true; + + return false; +} + +/* * Do one round of quiescent-state forcing. */ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) @@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg) wait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); + rsp->gp_state = RCU_GP_DONE_GPS; /* Locking provides needed memory barrier. */ if (rcu_gp_init(rsp)) break; @@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg) TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; ret = wait_event_interruptible_timeout(rsp->gp_wq, - ((gf = READ_ONCE(rsp->gp_flags)) & - RCU_GP_FLAG_FQS) || - (!READ_ONCE(rnp->qsmask) && - !rcu_preempt_blocked_readers_cgp(rnp)), - j); + rcu_gp_fqs_check_wake(rsp, &gf), j); + rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg) } /* Handle grace-period end. */ + rsp->gp_state = RCU_GP_CLEANUP; rcu_gp_cleanup(rsp); + rsp->gp_state = RCU_GP_CLEANED; } } @@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void) */ void synchronize_sched(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_sched() in RCU-sched read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched); */ void synchronize_rcu_bh(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; if (rcu_gp_is_expedited()) @@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_rcu); -static int synchronize_sched_expedited_cpu_stop(void *data) +/** + * get_state_synchronize_sched - Snapshot current RCU-sched state + * + * Returns a cookie that is used by a later call to cond_synchronize_sched() + * to determine whether or not a full grace period has elapsed in the + * meantime. + */ +unsigned long get_state_synchronize_sched(void) { /* - * There must be a full memory barrier on each affected CPU - * between the time that try_stop_cpus() is called and the - * time that it returns. - * - * In the current initial implementation of cpu_stop, the - * above condition is already met when the control reaches - * this point and the following smp_mb() is not strictly - * necessary. Do smp_mb() anyway for documentation and - * robustness against future implementation changes. + * Any prior manipulation of RCU-protected data must happen + * before the load from ->gpnum. + */ + smp_mb(); /* ^^^ */ + + /* + * Make sure this load happens before the purportedly + * time-consuming work between get_state_synchronize_sched() + * and cond_synchronize_sched(). + */ + return smp_load_acquire(&rcu_sched_state.gpnum); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_sched); + +/** + * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period + * + * @oldstate: return value from earlier call to get_state_synchronize_sched() + * + * If a full RCU-sched grace period has elapsed since the earlier call to + * get_state_synchronize_sched(), just return. Otherwise, invoke + * synchronize_sched() to wait for a full grace period. + * + * Yes, this function does not take counter wrap into account. But + * counter wrap is harmless. If the counter wraps, we have waited for + * more than 2 billion grace periods (and way more on a 64-bit system!), + * so waiting for one additional grace period should be just fine. + */ +void cond_synchronize_sched(unsigned long oldstate) +{ + unsigned long newstate; + + /* + * Ensure that this load happens before any RCU-destructive + * actions the caller might carry out after we return. */ - smp_mb(); /* See above comment block. */ + newstate = smp_load_acquire(&rcu_sched_state.completed); + if (ULONG_CMP_GE(oldstate, newstate)) + synchronize_sched(); +} +EXPORT_SYMBOL_GPL(cond_synchronize_sched); + +/* Adjust sequence number for start of update-side operation. */ +static void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(!(*sp & 0x1)); +} + +/* Adjust sequence number for end of update-side operation. */ +static void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WRITE_ONCE(*sp, *sp + 1); + WARN_ON_ONCE(*sp & 0x1); +} + +/* Take a snapshot of the update side's sequence number. */ +static unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = (READ_ONCE(*sp) + 3) & ~0x1; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + return rcu_seq_snap(&rsp->expedited_sequence); +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, + struct rcu_data *rdp, + atomic_long_t *stat, unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + if (rnp) + mutex_unlock(&rnp->exp_funnel_mutex); + else if (rdp) + mutex_unlock(&rdp->exp_funnel_mutex); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns a + * pointer to the root rcu_node structure, or NULL if some other + * task did the expedited grace period for us. + */ +static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp; + struct rcu_node *rnp0; + struct rcu_node *rnp1 = NULL; + + /* + * First try directly acquiring the root lock in order to reduce + * latency in the common case where expedited grace periods are + * rare. We check mutex_is_locked() to avoid pathological levels of + * memory contention on ->exp_funnel_mutex in the heavy-load case. + */ + rnp0 = rcu_get_root(rsp); + if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { + if (mutex_trylock(&rnp0->exp_funnel_mutex)) { + if (sync_exp_work_done(rsp, rnp0, NULL, + &rsp->expedited_workdone0, s)) + return NULL; + return rnp0; + } + } + + /* + * Each pass through the following loop works its way + * up the rcu_node tree, returning if others have done the + * work or otherwise falls through holding the root rnp's + * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure + * can be inexact, as it is just promoting locality and is not + * strictly needed for correctness. + */ + rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + return NULL; + mutex_lock(&rdp->exp_funnel_mutex); + rnp0 = rdp->mynode; + for (; rnp0 != NULL; rnp0 = rnp0->parent) { + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone2, s)) + return NULL; + mutex_lock(&rnp0->exp_funnel_mutex); + if (rnp1) + mutex_unlock(&rnp1->exp_funnel_mutex); + else + mutex_unlock(&rdp->exp_funnel_mutex); + rnp1 = rnp0; + } + if (sync_exp_work_done(rsp, rnp1, rdp, + &rsp->expedited_workdone3, s)) + return NULL; + return rnp1; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + struct rcu_data *rdp = data; + struct rcu_state *rsp = rdp->rsp; + + /* We are here: If we are last, do the wakeup. */ + rdp->exp_done = true; + if (atomic_dec_and_test(&rsp->expedited_need_qs)) + wake_up(&rsp->expedited_wq); return 0; } +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + struct rcu_data *rdp; + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = wait_event_interruptible_timeout( + rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs), + jiffies_stall); + if (ret > 0) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + wait_event(rsp->expedited_wq, + !atomic_read(&rsp->expedited_need_qs)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs: {", + rsp->name); + for_each_online_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + + if (rdp->exp_done) + continue; + pr_cont(" %d", cpu); + } + pr_cont(" } %lu jiffies s: %lu\n", + jiffies - jiffies_start, rsp->expedited_sequence); + for_each_online_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + + if (rdp->exp_done) + continue; + dump_cpu_task(cpu); + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + /** * synchronize_sched_expedited - Brute-force RCU-sched grace period * @@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data) * restructure your code to batch your updates, and then use a single * synchronize_sched() instead. * - * This implementation can be thought of as an application of ticket - * locking to RCU, with sync_sched_expedited_started and - * sync_sched_expedited_done taking on the roles of the halves - * of the ticket-lock word. Each task atomically increments - * sync_sched_expedited_started upon entry, snapshotting the old value, - * then attempts to stop all the CPUs. If this succeeds, then each - * CPU will have executed a context switch, resulting in an RCU-sched - * grace period. We are then done, so we use atomic_cmpxchg() to - * update sync_sched_expedited_done to match our snapshot -- but - * only if someone else has not already advanced past our snapshot. - * - * On the other hand, if try_stop_cpus() fails, we check the value - * of sync_sched_expedited_done. If it has advanced past our - * initial snapshot, then someone else must have forced a grace period - * some time after we took our snapshot. In this case, our work is - * done for us, and we can simply return. Otherwise, we try again, - * but keep our initial snapshot for purposes of checking for someone - * doing our work for us. - * - * If we fail too many times in a row, we fall back to synchronize_sched(). + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. */ void synchronize_sched_expedited(void) { - cpumask_var_t cm; - bool cma = false; int cpu; - long firstsnap, s, snap; - int trycount = 0; + unsigned long s; + struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; - /* - * If we are in danger of counter wrap, just do synchronize_sched(). - * By allowing sync_sched_expedited_started to advance no more than - * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring - * that more than 3.5 billion CPUs would be required to force a - * counter wrap on a 32-bit system. Quite a few more CPUs would of - * course be required on a 64-bit system. - */ - if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), - (ulong)atomic_long_read(&rsp->expedited_done) + - ULONG_MAX / 8)) { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_wrap); - return; - } + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); - /* - * Take a ticket. Note that atomic_inc_return() implies a - * full memory barrier. - */ - snap = atomic_long_inc_return(&rsp->expedited_start); - firstsnap = snap; if (!try_get_online_cpus()) { /* CPU hotplug operation in flight, fall back to normal GP. */ wait_rcu_gp(call_rcu_sched); @@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void) } WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); - /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ - cma = zalloc_cpumask_var(&cm, GFP_KERNEL); - if (cma) { - cpumask_copy(cm, cpu_online_mask); - cpumask_clear_cpu(raw_smp_processor_id(), cm); - for_each_cpu(cpu, cm) { - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - cpumask_clear_cpu(cpu, cm); - } - if (cpumask_weight(cm) == 0) - goto all_cpus_idle; + rnp = exp_funnel_lock(rsp, s); + if (rnp == NULL) { + put_online_cpus(); + return; /* Someone else did our work for us. */ } - /* - * Each pass through the following loop attempts to force a - * context switch on each CPU. - */ - while (try_stop_cpus(cma ? cm : cpu_online_mask, - synchronize_sched_expedited_cpu_stop, - NULL) == -EAGAIN) { - put_online_cpus(); - atomic_long_inc(&rsp->expedited_tryfail); - - /* Check to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone1); - free_cpumask_var(cm); - return; - } + rcu_exp_gp_seq_start(rsp); - /* No joy, try again later. Or just synchronize_sched(). */ - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); - return; - } + /* Stop each CPU that is online, non-idle, and not us. */ + init_waitqueue_head(&rsp->expedited_wq); + atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */ + for_each_online_cpu(cpu) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - /* Recheck to see if someone else did our work for us. */ - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_workdone2); - free_cpumask_var(cm); - return; - } + rdp->exp_done = false; - /* - * Refetching sync_sched_expedited_started allows later - * callers to piggyback on our grace period. We retry - * after they started, so our grace period works for them, - * and they started after our first try, so their grace - * period works for us. - */ - if (!try_get_online_cpus()) { - /* CPU hotplug operation in flight, use normal GP. */ - wait_rcu_gp(call_rcu_sched); - atomic_long_inc(&rsp->expedited_normal); - free_cpumask_var(cm); - return; - } - snap = atomic_long_read(&rsp->expedited_start); - smp_mb(); /* ensure read is before try_stop_cpus(). */ + /* Skip our CPU and any idle CPUs. */ + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + continue; + atomic_inc(&rsp->expedited_need_qs); + stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop, + rdp, &rdp->exp_stop_work); } - atomic_long_inc(&rsp->expedited_stoppedcpus); -all_cpus_idle: - free_cpumask_var(cm); + /* Remove extra count and, if necessary, wait for CPUs to stop. */ + if (!atomic_dec_and_test(&rsp->expedited_need_qs)) + synchronize_sched_expedited_wait(rsp); - /* - * Everyone up to our most recent fetch is covered by our grace - * period. Update the counter, but only if our work is still - * relevant -- which it won't be if someone who started later - * than we did already did their update. - */ - do { - atomic_long_inc(&rsp->expedited_done_tries); - s = atomic_long_read(&rsp->expedited_done); - if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { - /* ensure test happens before caller kfree */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(&rsp->expedited_done_lost); - break; - } - } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); - atomic_long_inc(&rsp->expedited_done_exit); + rcu_exp_gp_seq_end(rsp); + mutex_unlock(&rnp->exp_funnel_mutex); put_online_cpus(); } @@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp) struct rcu_state *rsp = rdp->rsp; if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { - _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); complete(&rsp->barrier_completion); } else { - _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); } } @@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type) struct rcu_state *rsp = type; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); + _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); atomic_inc(&rsp->barrier_cpu_count); rsp->call(&rdp->barrier_head, rcu_barrier_callback); } @@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp) { int cpu; struct rcu_data *rdp; - unsigned long snap = READ_ONCE(rsp->n_barrier_done); - unsigned long snap_done; + unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); - _rcu_barrier_trace(rsp, "Begin", -1, snap); + _rcu_barrier_trace(rsp, "Begin", -1, s); /* Take mutex to serialize concurrent rcu_barrier() requests. */ mutex_lock(&rsp->barrier_mutex); - /* - * Ensure that all prior references, including to ->n_barrier_done, - * are ordered before the _rcu_barrier() machinery. - */ - smp_mb(); /* See above block comment. */ - - /* - * Recheck ->n_barrier_done to see if others did our work for us. - * This means checking ->n_barrier_done for an even-to-odd-to-even - * transition. The "if" expression below therefore rounds the old - * value up to the next even number and adds two before comparing. - */ - snap_done = rsp->n_barrier_done; - _rcu_barrier_trace(rsp, "Check", -1, snap_done); - - /* - * If the value in snap is odd, we needed to wait for the current - * rcu_barrier() to complete, then wait for the next one, in other - * words, we need the value of snap_done to be three larger than - * the value of snap. On the other hand, if the value in snap is - * even, we only had to wait for the next rcu_barrier() to complete, - * in other words, we need the value of snap_done to be only two - * greater than the value of snap. The "(snap + 3) & ~0x1" computes - * this for us (thank you, Linus!). - */ - if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { - _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); + /* Did someone else do our work for us? */ + if (rcu_seq_done(&rsp->barrier_sequence, s)) { + _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); return; } - /* - * Increment ->n_barrier_done to avoid duplicate work. Use - * WRITE_ONCE() to prevent the compiler from speculating - * the increment to precede the early-exit check. - */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); - _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); - smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ + /* Mark the start of the barrier operation. */ + rcu_seq_start(&rsp->barrier_sequence); + _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); /* * Initialize the count to one rather than to zero in order to @@ -3671,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp) if (rcu_is_nocb_cpu(cpu)) { if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } else { _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_mb__before_atomic(); atomic_inc(&rsp->barrier_cpu_count); __call_rcu(&rdp->barrier_head, @@ -3682,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp) } } else if (READ_ONCE(rdp->qlen)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); } else { _rcu_barrier_trace(rsp, "OnlineNQ", cpu, - rsp->n_barrier_done); + rsp->barrier_sequence); } } put_online_cpus(); @@ -3698,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp) if (atomic_dec_and_test(&rsp->barrier_cpu_count)) complete(&rsp->barrier_completion); - /* Increment ->n_barrier_done to prevent duplicate work. */ - smp_mb(); /* Keep increment after above mechanism. */ - WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1); - WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); - _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); - smp_mb(); /* Keep increment before caller's subsequent code. */ - /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ wait_for_completion(&rsp->barrier_completion); + /* Mark the end of the barrier operation. */ + _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); + rcu_seq_end(&rsp->barrier_sequence); + /* Other rcu_barrier() invocations can now safely proceed. */ mutex_unlock(&rsp->barrier_mutex); } @@ -3770,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); rdp->cpu = cpu; rdp->rsp = rsp; + mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -3961,22 +4072,22 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on the rcu_fanout_exact boot parameter. */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) +static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) { int i; if (rcu_fanout_exact) { - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = RCU_FANOUT; + levelspread[i] = RCU_FANOUT; } else { int ccur; int cprv; cprv = nr_cpu_ids; for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; cprv = ccur; } } @@ -3988,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) static void __init rcu_init_one(struct rcu_state *rsp, struct rcu_data __percpu *rda) { - static const char * const buf[] = { - "rcu_node_0", - "rcu_node_1", - "rcu_node_2", - "rcu_node_3" }; /* Match MAX_RCU_LVLS */ - static const char * const fqs[] = { - "rcu_node_fqs_0", - "rcu_node_fqs_1", - "rcu_node_fqs_2", - "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ + static const char * const buf[] = RCU_NODE_NAME_INIT; + static const char * const fqs[] = RCU_FQS_NAME_INIT; + static const char * const exp[] = RCU_EXP_NAME_INIT; + static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT; static u8 fl_mask = 0x1; + + int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ + int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; int j; struct rcu_node *rnp; - BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ /* Silence gcc 4.8 false positive about array index out of range. */ if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) @@ -4013,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp, /* Initialize the level-tracking arrays. */ for (i = 0; i < rcu_num_lvls; i++) - rsp->levelcnt[i] = num_rcu_lvl[i]; + levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; - rcu_init_levelspread(rsp); + rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; + rcu_init_levelspread(levelspread, levelcnt); rsp->flavor_mask = fl_mask; fl_mask <<= 1; /* Initialize the elements themselves, starting from the leaves. */ for (i = rcu_num_lvls - 1; i >= 0; i--) { - cpustride *= rsp->levelspread[i]; + cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + for (j = 0; j < levelcnt[i]; j++, rnp++) { raw_spin_lock_init(&rnp->lock); lockdep_set_class_and_name(&rnp->lock, &rcu_node_class[i], buf[i]); @@ -4045,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp, rnp->grpmask = 0; rnp->parent = NULL; } else { - rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpnum = j % levelspread[i - 1]; rnp->grpmask = 1UL << rnp->grpnum; rnp->parent = rsp->level[i - 1] + - j / rsp->levelspread[i - 1]; + j / levelspread[i - 1]; } rnp->level = i; INIT_LIST_HEAD(&rnp->blkd_tasks); rcu_init_one_nocb(rnp); + mutex_init(&rnp->exp_funnel_mutex); + if (rsp == &rcu_sched_state) + lockdep_set_class_and_name( + &rnp->exp_funnel_mutex, + &rcu_exp_sched_class[i], exp_sched[i]); + else + lockdep_set_class_and_name( + &rnp->exp_funnel_mutex, + &rcu_exp_class[i], exp[i]); } } @@ -4076,9 +4193,7 @@ static void __init rcu_init_geometry(void) { ulong d; int i; - int j; - int n = nr_cpu_ids; - int rcu_capacity[MAX_RCU_LVLS + 1]; + int rcu_capacity[RCU_NUM_LVLS]; /* * Initialize any unspecified boot parameters. @@ -4101,47 +4216,49 @@ static void __init rcu_init_geometry(void) rcu_fanout_leaf, nr_cpu_ids); /* - * Compute number of nodes that can be handled an rcu_node tree - * with the given number of levels. Setting rcu_capacity[0] makes - * some of the arithmetic easier. - */ - rcu_capacity[0] = 1; - rcu_capacity[1] = rcu_fanout_leaf; - for (i = 2; i <= MAX_RCU_LVLS; i++) - rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; - - /* * The boot-time rcu_fanout_leaf parameter is only permitted * to increase the leaf-level fanout, not decrease it. Of course, * the leaf-level fanout cannot exceed the number of bits in - * the rcu_node masks. Finally, the tree must be able to accommodate - * the configured number of CPUs. Complain and fall back to the - * compile-time values if these limits are exceeded. + * the rcu_node masks. Complain and fall back to the compile- + * time values if these limits are exceeded. */ if (rcu_fanout_leaf < RCU_FANOUT_LEAF || - rcu_fanout_leaf > sizeof(unsigned long) * 8 || - n > rcu_capacity[MAX_RCU_LVLS]) { + rcu_fanout_leaf > sizeof(unsigned long) * 8) { + rcu_fanout_leaf = RCU_FANOUT_LEAF; WARN_ON(1); return; } + /* + * Compute number of nodes that can be handled an rcu_node tree + * with the given number of levels. + */ + rcu_capacity[0] = rcu_fanout_leaf; + for (i = 1; i < RCU_NUM_LVLS; i++) + rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; + + /* + * The tree must be able to accommodate the configured number of CPUs. + * If this limit is exceeded than we have a serious problem elsewhere. + */ + if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) + panic("rcu_init_geometry: rcu_capacity[] is too small"); + + /* Calculate the number of levels in the tree. */ + for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { + } + rcu_num_lvls = i + 1; + /* Calculate the number of rcu_nodes at each level of the tree. */ - for (i = 1; i <= MAX_RCU_LVLS; i++) - if (n <= rcu_capacity[i]) { - for (j = 0; j <= i; j++) - num_rcu_lvl[j] = - DIV_ROUND_UP(n, rcu_capacity[i - j]); - rcu_num_lvls = i; - for (j = i + 1; j <= MAX_RCU_LVLS; j++) - num_rcu_lvl[j] = 0; - break; - } + for (i = 0; i < rcu_num_lvls; i++) { + int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; + num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); + } /* Calculate the total number of rcu_node structures. */ rcu_num_nodes = 0; - for (i = 0; i <= MAX_RCU_LVLS; i++) + for (i = 0; i < rcu_num_lvls; i++) rcu_num_nodes += num_rcu_lvl[i]; - rcu_num_nodes -= n; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 4adb7ca0bf47..2e991f8361e4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/stop_machine.h> /* * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and @@ -36,8 +37,6 @@ * Of course, your mileage may vary. */ -#define MAX_RCU_LVLS 4 - #ifdef CONFIG_RCU_FANOUT #define RCU_FANOUT CONFIG_RCU_FANOUT #else /* #ifdef CONFIG_RCU_FANOUT */ @@ -66,38 +65,53 @@ #if NR_CPUS <= RCU_FANOUT_1 # define RCU_NUM_LVLS 1 # define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 (NR_CPUS) -# define NUM_RCU_LVL_2 0 -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0" } #elif NR_CPUS <= RCU_FANOUT_2 # define RCU_NUM_LVLS 2 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_2 (NR_CPUS) -# define NUM_RCU_LVL_3 0 -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" } #elif NR_CPUS <= RCU_FANOUT_3 # define RCU_NUM_LVLS 3 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_3 (NR_CPUS) -# define NUM_RCU_LVL_4 0 +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" } #elif NR_CPUS <= RCU_FANOUT_4 # define RCU_NUM_LVLS 4 # define NUM_RCU_LVL_0 1 # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_LVL_4 (NR_CPUS) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" } +# define RCU_EXP_SCHED_NAME_INIT \ + { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" } #else # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ -#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) -#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) - extern int rcu_num_lvls; extern int rcu_num_nodes; @@ -236,6 +250,8 @@ struct rcu_node { int need_future_gp[2]; /* Counts of upcoming no-CB GP requests. */ raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; + + struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp; } ____cacheline_internodealigned_in_smp; /* @@ -287,12 +303,13 @@ struct rcu_data { bool gpwrap; /* Possible gpnum/completed wrap. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned long ticks_this_gp; /* The number of scheduling-clock */ /* ticks this CPU has handled */ /* during and after the last grace */ /* period it is aware of. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ + struct cpu_stop_work exp_stop_work; + /* Expedited grace-period control */ + /* for CPU stopping. */ /* 2) batch handling */ /* @@ -355,11 +372,13 @@ struct rcu_data { unsigned long n_rp_nocb_defer_wakeup; unsigned long n_rp_need_nothing; - /* 6) _rcu_barrier() and OOM callbacks. */ + /* 6) _rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; #ifdef CONFIG_RCU_FAST_NO_HZ struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ + struct mutex exp_funnel_mutex; + bool exp_done; /* Expedited QS for this CPU? */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -387,9 +406,7 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ /* 8) RCU CPU stall data. */ -#ifdef CONFIG_RCU_CPU_STALL_INFO unsigned int softirq_snap; /* Snapshot of softirq activity. */ -#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ int cpu; struct rcu_state *rsp; @@ -442,9 +459,9 @@ do { \ */ struct rcu_state { struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ - struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ - u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ - u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ + struct rcu_node *level[RCU_NUM_LVLS + 1]; + /* Hierarchy levels (+1 to */ + /* shut bogus gcc warning) */ u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ @@ -479,21 +496,18 @@ struct rcu_state { struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ - unsigned long n_barrier_done; /* ++ at start and end of */ + unsigned long barrier_sequence; /* ++ at start and end of */ /* _rcu_barrier(). */ /* End of fields guarded by barrier_mutex. */ - atomic_long_t expedited_start; /* Starting ticket. */ - atomic_long_t expedited_done; /* Done ticket. */ - atomic_long_t expedited_wrap; /* # near-wrap incidents. */ - atomic_long_t expedited_tryfail; /* # acquisition failures. */ + unsigned long expedited_sequence; /* Take a ticket. */ + atomic_long_t expedited_workdone0; /* # done by others #0. */ atomic_long_t expedited_workdone1; /* # done by others #1. */ atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ - atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ - atomic_long_t expedited_done_tries; /* # tries to update _done. */ - atomic_long_t expedited_done_lost; /* # times beaten to _done. */ - atomic_long_t expedited_done_exit; /* # times exited _done loop. */ + atomic_t expedited_need_qs; /* # CPUs left to check in. */ + wait_queue_head_t expedited_wq; /* Wait for check-ins. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ @@ -527,7 +541,11 @@ struct rcu_state { /* Values for rcu_state structure's gp_flags field. */ #define RCU_GP_WAIT_INIT 0 /* Initial state. */ #define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ -#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */ +#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */ +#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */ +#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ +#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ extern struct list_head rcu_struct_flavors; @@ -635,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ } #endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifdef CONFIG_PPC +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ +#else /* #ifdef CONFIG_PPC */ +#define smp_mb__after_unlock_lock() do { } while (0) +#endif /* #else #ifdef CONFIG_PPC */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 013485fb2b06..b2bf3963a0ae 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) pr_info("\tRCU torture testing starts during boot.\n"); - if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); - if (NUM_RCU_LVL_4 != 0) - pr_info("\tFour-level hierarchy is enabled.\n"); + if (RCU_NUM_LVLS >= 4) + pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", RCU_FANOUT_LEAF); @@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp) rcu_print_detail_task_stall_rnp(rnp); } -#ifdef CONFIG_RCU_CPU_STALL_INFO - static void rcu_print_task_stall_begin(struct rcu_node *rnp) { pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void) pr_cont("\n"); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void rcu_print_task_stall_begin(struct rcu_node *rnp) -{ -} - -static void rcu_print_task_stall_end(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. @@ -538,10 +522,10 @@ EXPORT_SYMBOL_GPL(call_rcu); */ void synchronize_rcu(void) { - rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && - !lock_is_held(&rcu_lock_map) && - !lock_is_held(&rcu_sched_lock_map), - "Illegal synchronize_rcu() in RCU read-side critical section"); + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || + lock_is_held(&rcu_lock_map) || + lock_is_held(&rcu_sched_lock_map), + "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; if (rcu_gp_is_expedited()) @@ -552,8 +536,6 @@ void synchronize_rcu(void) EXPORT_SYMBOL_GPL(synchronize_rcu); static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); -static unsigned long sync_rcu_preempt_exp_count; -static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); /* * Return non-zero if there are any tasks in RCU read-side critical @@ -573,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp) * for the current expedited grace period. Works only for preemptible * RCU -- other RCU implementation use other means. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) { @@ -589,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, bool wake) @@ -628,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 * that work is needed here. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) @@ -671,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, * enabling rcu_read_unlock_special() to do the bit-clearing. * - * Caller must hold sync_rcu_preempt_exp_mutex. + * Caller must hold the root rcu_node's exp_funnel_mutex. */ static void sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) @@ -719,51 +701,17 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) void synchronize_rcu_expedited(void) { struct rcu_node *rnp; + struct rcu_node *rnp_unlock; struct rcu_state *rsp = rcu_state_p; - unsigned long snap; - int trycount = 0; + unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1; - smp_mb(); /* Above access cannot bleed into critical section. */ + s = rcu_exp_gp_seq_snap(rsp); - /* - * Block CPU-hotplug operations. This means that any CPU-hotplug - * operation that finds an rcu_node structure with tasks in the - * process of being boosted will know that all tasks blocking - * this expedited grace period will already be in the process of - * being boosted. This simplifies the process of moving tasks - * from leaf to root rcu_node structures. - */ - if (!try_get_online_cpus()) { - /* CPU-hotplug operation in flight, fall back to normal GP. */ - wait_rcu_gp(call_rcu); - return; - } + rnp_unlock = exp_funnel_lock(rsp, s); + if (rnp_unlock == NULL) + return; /* Someone else did our work for us. */ - /* - * Acquire lock, falling back to synchronize_rcu() if too many - * lock-acquisition failures. Of course, if someone does the - * expedited grace period for us, just leave. - */ - while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { - if (ULONG_CMP_LT(snap, - READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto mb_ret; /* Others did our work for us. */ - } - if (trycount++ < 10) { - udelay(trycount * num_online_cpus()); - } else { - put_online_cpus(); - wait_rcu_gp(call_rcu); - return; - } - } - if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) { - put_online_cpus(); - goto unlock_mb_ret; /* Others did our work for us. */ - } + rcu_exp_gp_seq_start(rsp); /* force all RCU readers onto ->blkd_tasks lists. */ synchronize_sched_expedited(); @@ -779,20 +727,14 @@ void synchronize_rcu_expedited(void) rcu_for_each_leaf_node(rsp, rnp) sync_rcu_preempt_exp_init2(rsp, rnp); - put_online_cpus(); - /* Wait for snapshotted ->blkd_tasks lists to drain. */ rnp = rcu_get_root(rsp); wait_event(sync_rcu_preempt_exp_wq, sync_rcu_preempt_exp_done(rnp)); /* Clean up and exit. */ - smp_mb(); /* ensure expedited GP seen before counter increment. */ - WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); -unlock_mb_ret: - mutex_unlock(&sync_rcu_preempt_exp_mutex); -mb_ret: - smp_mb(); /* ensure subsequent action seen after grace period. */ + rcu_exp_gp_seq_end(rsp); + mutex_unlock(&rnp_unlock->exp_funnel_mutex); } EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); @@ -1061,8 +1003,7 @@ static int rcu_boost(struct rcu_node *rnp) } /* - * Priority-boosting kthread. One per leaf rcu_node and one for the - * root rcu_node. + * Priority-boosting kthread, one per leaf rcu_node. */ static int rcu_boost_kthread(void *arg) { @@ -1680,12 +1621,10 @@ static int rcu_oom_notify(struct notifier_block *self, */ atomic_set(&oom_callback_count, 1); - get_online_cpus(); for_each_online_cpu(cpu) { smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); cond_resched_rcu_qs(); } - put_online_cpus(); /* Unconditionally decrement: no need to wake ourselves up. */ atomic_dec(&oom_callback_count); @@ -1706,8 +1645,6 @@ early_initcall(rcu_register_oom_notifier); #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ -#ifdef CONFIG_RCU_CPU_STALL_INFO - #ifdef CONFIG_RCU_FAST_NO_HZ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) @@ -1796,33 +1733,6 @@ static void increment_cpu_stall_ticks(void) raw_cpu_inc(rsp->rda->ticks_this_gp); } -#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ - -static void print_cpu_stall_info_begin(void) -{ - pr_cont(" {"); -} - -static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) -{ - pr_cont(" %d", cpu); -} - -static void print_cpu_stall_info_end(void) -{ - pr_cont("} "); -} - -static void zero_cpu_stall_ticks(struct rcu_data *rdp) -{ -} - -static void increment_cpu_stall_ticks(void) -{ -} - -#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ - #ifdef CONFIG_RCU_NOCB_CPU /* diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 3ea7ffc7d5c4..6fc4c5ff3bb5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v) static int show_rcubarrier(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "bcc: %d nbd: %lu\n", + seq_printf(m, "bcc: %d bseq: %lu\n", atomic_read(&rsp->barrier_cpu_count), - rsp->n_barrier_done); + rsp->barrier_sequence); return 0; } @@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v) { struct rcu_state *rsp = (struct rcu_state *)m->private; - seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", - atomic_long_read(&rsp->expedited_start), - atomic_long_read(&rsp->expedited_done), - atomic_long_read(&rsp->expedited_wrap), - atomic_long_read(&rsp->expedited_tryfail), + seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", + rsp->expedited_sequence, + atomic_long_read(&rsp->expedited_workdone0), atomic_long_read(&rsp->expedited_workdone1), atomic_long_read(&rsp->expedited_workdone2), + atomic_long_read(&rsp->expedited_workdone3), atomic_long_read(&rsp->expedited_normal), - atomic_long_read(&rsp->expedited_stoppedcpus), - atomic_long_read(&rsp->expedited_done_tries), - atomic_long_read(&rsp->expedited_done_lost), - atomic_long_read(&rsp->expedited_done_exit)); + atomic_read(&rsp->expedited_need_qs), + rsp->expedited_sequence / 2); return 0; } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index afaecb7a799a..7a0b3bc7c5ed 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) +/** + * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section? + * + * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an + * RCU-sched read-side critical section. In absence of + * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side + * critical section unless it can prove otherwise. Note that disabling + * of preemption (including disabling irqs) counts as an RCU-sched + * read-side critical section. This is useful for debug checks in functions + * that required that they be called within an RCU-sched read-side + * critical section. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot + * and while lockdep is disabled. + * + * Note that if the CPU is in the idle loop from an RCU point of + * view (ie: that we are in the section between rcu_idle_enter() and + * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU + * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs + * that are in such a section, considering these as in extended quiescent + * state, so such a CPU is effectively never in an RCU read-side critical + * section regardless of what RCU primitives it invokes. This state of + * affairs is required --- we need to keep an RCU-free window in idle + * where the CPU may possibly enter into low power mode. This way we can + * notice an extended quiescent state to other CPUs that started a grace + * period. Otherwise we would delay any grace period as long as we run in + * the idle task. + * + * Similarly, we avoid claiming an SRCU read lock held if the current + * CPU is offline. + */ +int rcu_read_lock_sched_held(void) +{ + int lockdep_opinion = 0; + + if (!debug_lockdep_rcu_enabled()) + return 1; + if (!rcu_is_watching()) + return 0; + if (!rcu_lockdep_current_cpu_online()) + return 0; + if (debug_locks) + lockdep_opinion = lock_is_held(&rcu_sched_lock_map); + return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); +} +EXPORT_SYMBOL(rcu_read_lock_sched_held); +#endif + #ifndef CONFIG_TINY_RCU static atomic_t rcu_expedited_nesting = @@ -269,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } +EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void wait_rcu_gp(call_rcu_func_t crf) +void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, + struct rcu_synchronize *rs_array) { - struct rcu_synchronize rcu; + int i; - init_rcu_head_on_stack(&rcu.head); - init_completion(&rcu.completion); - /* Will wake me after RCU finished. */ - crf(&rcu.head, wakeme_after_rcu); - /* Wait for it. */ - wait_for_completion(&rcu.completion); - destroy_rcu_head_on_stack(&rcu.head); + /* Initialize and register callbacks for each flavor specified. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) { + might_sleep(); + continue; + } + init_rcu_head_on_stack(&rs_array[i].head); + init_completion(&rs_array[i].completion); + (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu); + } + + /* Wait for all callbacks to be invoked. */ + for (i = 0; i < n; i++) { + if (checktiny && + (crcu_array[i] == call_rcu || + crcu_array[i] == call_rcu_bh)) + continue; + wait_for_completion(&rs_array[i].completion); + destroy_rcu_head_on_stack(&rs_array[i].head); + } } -EXPORT_SYMBOL_GPL(wait_rcu_gp); +EXPORT_SYMBOL_GPL(__wait_rcu_gp); #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head) @@ -523,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks); void synchronize_rcu_tasks(void) { /* Complain if the scheduler has not started. */ - rcu_lockdep_assert(!rcu_scheduler_active, - "synchronize_rcu_tasks called too soon"); + RCU_LOCKDEP_WARN(!rcu_scheduler_active, + "synchronize_rcu_tasks called too soon"); /* Wait for the grace period. */ wait_rcu_gp(call_rcu_tasks); diff --git a/kernel/resource.c b/kernel/resource.c index 90552aab5f2d..fed052a1bc9f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -504,13 +504,13 @@ int region_is_ram(resource_size_t start, unsigned long size) { struct resource *p; resource_size_t end = start + size - 1; - int flags = IORESOURCE_MEM | IORESOURCE_BUSY; + unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; const char *name = "System RAM"; int ret = -1; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - if (end < p->start) + if (p->end < start) continue; if (p->start <= start && end <= p->end) { @@ -521,7 +521,7 @@ int region_is_ram(resource_size_t start, unsigned long size) ret = 1; break; } - if (p->end < start) + if (end < p->start) break; /* not found */ } read_unlock(&resource_lock); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9917c962be99..a585c7b2ccf0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2255,8 +2255,8 @@ unsigned long to_ratio(u64 period, u64 runtime) #ifdef CONFIG_SMP inline struct dl_bw *dl_bw_of(int i) { - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); return &cpu_rq(i)->rd->dl_bw; } @@ -2265,8 +2265,8 @@ static inline int dl_bw_cpus(int i) struct root_domain *rd = cpu_rq(i)->rd; int cpus = 0; - rcu_lockdep_assert(rcu_read_lock_sched_held(), - "sched RCU must be held"); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); for_each_cpu_and(i, rd->span, cpu_active_mask) cpus++; diff --git a/kernel/signal.c b/kernel/signal.c index 836df8dac6cc..0f6bbbe77b46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2748,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif #ifdef SEGV_BNDERR - err |= __put_user(from->si_lower, &to->si_lower); - err |= __put_user(from->si_upper, &to->si_upper); + if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) { + err |= __put_user(from->si_lower, &to->si_lower); + err |= __put_user(from->si_upper, &to->si_upper); + } #endif break; case __SI_CHLD: @@ -3017,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + siginfo_t info = {}; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3061,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + siginfo_t info = {}; if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 579ce1b929af..4008d9f95dd7 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -92,12 +92,10 @@ config NO_HZ_FULL depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP - # RCU_USER_QS dependency depends on HAVE_CONTEXT_TRACKING # VIRT_CPU_ACCOUNTING_GEN dependency depends on HAVE_VIRT_CPU_ACCOUNTING_GEN select NO_HZ_COMMON - select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 08ccc3da3ca0..50eb107f1198 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -120,19 +120,25 @@ static int __clockevents_switch_state(struct clock_event_device *dev, /* The clockevent device is getting replaced. Shut it down. */ case CLOCK_EVT_STATE_SHUTDOWN: - return dev->set_state_shutdown(dev); + if (dev->set_state_shutdown) + return dev->set_state_shutdown(dev); + return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; - return dev->set_state_periodic(dev); + if (dev->set_state_periodic) + return dev->set_state_periodic(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; - return dev->set_state_oneshot(dev); + if (dev->set_state_oneshot) + return dev->set_state_oneshot(dev); + return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ @@ -471,18 +477,6 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; - /* New state-specific callbacks */ - if (!dev->set_state_shutdown) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !dev->set_state_periodic) - return -EINVAL; - - if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && - !dev->set_state_oneshot) - return -EINVAL; - return 0; } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index d39f32cdd1b5..f6aae7977824 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -159,7 +159,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { struct clock_event_device *bc = tick_broadcast_device.evtdev; unsigned long flags; - int ret; + int ret = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -221,13 +221,14 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) * If we kept the cpu in the broadcast mask, * tell the caller to leave the per cpu device * in shutdown state. The periodic interrupt - * is delivered by the broadcast device. + * is delivered by the broadcast device, if + * the broadcast device exists and is not + * hrtimer based. */ - ret = cpumask_test_cpu(cpu, tick_broadcast_mask); + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER)) + ret = cpumask_test_cpu(cpu, tick_broadcast_mask); break; default: - /* Nothing to do */ - ret = 0; break; } } @@ -265,8 +266,22 @@ static bool tick_do_broadcast(struct cpumask *mask) * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { + struct clock_event_device *bc = tick_broadcast_device.evtdev; + cpumask_clear_cpu(cpu, mask); - local = true; + /* + * We only run the local handler, if the broadcast + * device is not hrtimer based. Otherwise we run into + * a hrtimer recursion. + * + * local timer_interrupt() + * local_handler() + * expire_hrtimers() + * bc_handler() + * local_handler() + * expire_hrtimers() + */ + local = !(bc->features & CLOCK_EVT_FEAT_HRTIMER); } if (!cpumask_empty(mask)) { @@ -301,6 +316,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bool bc_local; raw_spin_lock(&tick_broadcast_lock); + + /* Handle spurious interrupts gracefully */ + if (clockevent_state_shutdown(tick_broadcast_device.evtdev)) { + raw_spin_unlock(&tick_broadcast_lock); + return; + } + bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { @@ -359,8 +381,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { - if (tick_broadcast_device.mode == - TICKDEV_MODE_PERIODIC) + /* + * Only shutdown the cpu local device, if: + * + * - the broadcast device exists + * - the broadcast device is not a hrtimer based one + * - the broadcast device is in periodic mode to + * avoid a hickup during switch to oneshot mode + */ + if (bc && !(bc->features & CLOCK_EVT_FEAT_HRTIMER) && + tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } break; @@ -379,14 +409,16 @@ void tick_broadcast_control(enum tick_broadcast_mode mode) break; } - if (cpumask_empty(tick_broadcast_mask)) { - if (!bc_stopped) - clockevents_shutdown(bc); - } else if (bc_stopped) { - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - tick_broadcast_start_periodic(bc); - else - tick_broadcast_setup_oneshot(bc); + if (bc) { + if (cpumask_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_shutdown(bc); + } else if (bc_stopped) { + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + tick_broadcast_start_periodic(bc); + else + tick_broadcast_setup_oneshot(bc); + } } raw_spin_unlock(&tick_broadcast_lock); } @@ -662,71 +694,82 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -/** - * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode - * @state: The target state (enter/exit) - * - * The system enters/leaves a state, where affected devices might stop - * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. - * - * Called with interrupts disabled, so clockevents_lock is not - * required here because the local clock event device cannot go away - * under us. - */ -int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; - struct tick_device *td; int cpu, ret = 0; ktime_t now; /* - * Periodic mode does not care about the enter/exit of power - * states + * If there is no broadcast device, tell the caller not to go + * into deep idle. */ - if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - return 0; + if (!tick_broadcast_device.evtdev) + return -EBUSY; - /* - * We are called with preemtion disabled from the depth of the - * idle code, so we can't be moved away. - */ - td = this_cpu_ptr(&tick_cpu_device); - dev = td->evtdev; - - if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - return 0; + dev = this_cpu_ptr(&tick_cpu_device)->evtdev; raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; cpu = smp_processor_id(); if (state == TICK_BROADCAST_ENTER) { + /* + * If the current CPU owns the hrtimer broadcast + * mechanism, it cannot go deep idle and we do not add + * the CPU to the broadcast mask. We don't have to go + * through the EXIT path as the local timer is not + * shutdown. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) + goto out; + + /* + * If the broadcast device is in periodic mode, we + * return. + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { + /* If it is a hrtimer based broadcast, return busy */ + if (bc->features & CLOCK_EVT_FEAT_HRTIMER) + ret = -EBUSY; + goto out; + } + if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); + + /* Conditionally shut down the local timer. */ broadcast_shutdown_local(bc, dev); + /* * We only reprogram the broadcast timer if we * did not mark ourself in the force mask and * if the cpu local event is earlier than the * broadcast event. If the current CPU is in * the force mask, then we are going to be - * woken by the IPI right away. + * woken by the IPI right away; we return + * busy, so the CPU does not try to go deep + * idle. */ - if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && - dev->next_event.tv64 < bc->next_event.tv64) + if (cpumask_test_cpu(cpu, tick_broadcast_force_mask)) { + ret = -EBUSY; + } else if (dev->next_event.tv64 < bc->next_event.tv64) { tick_broadcast_set_event(bc, cpu, dev->next_event); + /* + * In case of hrtimer broadcasts the + * programming might have moved the + * timer to this cpu. If yes, remove + * us from the broadcast mask and + * return busy. + */ + ret = broadcast_needs_cpu(bc, cpu); + if (ret) { + cpumask_clear_cpu(cpu, + tick_broadcast_oneshot_mask); + } + } } - /* - * If the current CPU owns the hrtimer broadcast - * mechanism, it cannot go deep idle and we remove the - * CPU from the broadcast mask. We don't have to go - * through the EXIT path as the local timer is not - * shutdown. - */ - ret = broadcast_needs_cpu(bc, cpu); - if (ret) - cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); @@ -796,7 +839,6 @@ out: raw_spin_unlock(&tick_broadcast_lock); return ret; } -EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu @@ -938,6 +980,16 @@ bool tick_broadcast_oneshot_available(void) return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; } +#else +int __tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + if (!bc || (bc->features & CLOCK_EVT_FEAT_HRTIMER)) + return -EBUSY; + + return 0; +} #endif void __init tick_broadcast_init(void) diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 76446cb5dfe1..f8bf47571dda 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -343,6 +343,28 @@ out_bc: tick_install_broadcast_device(newdev); } +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop + * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. + */ +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + + if (!(td->evtdev->features & CLOCK_EVT_FEAT_C3STOP)) + return 0; + + return __tick_broadcast_oneshot_control(state); +} +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); + #ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 42fdf4958bcc..a4a8d4e9baa1 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -71,4 +71,14 @@ extern void tick_cancel_sched_timer(int cpu); static inline void tick_cancel_sched_timer(int cpu) { } #endif +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int __tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int +__tick_broadcast_oneshot_control(enum tick_broadcast_state state) +{ + return -EBUSY; +} +#endif + #endif diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 5e097fa9faf7..84190f02b521 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -807,8 +807,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer->flags &= ~TIMER_BASEMASK; - timer->flags |= base->cpu; + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | base->cpu); } } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3b9a48ae153a..1153c43428f3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT + depends on KPROBE_EVENT || UPROBE_EVENT bool default y help diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 02bece4a99ea..eb11011b5292 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -98,6 +98,13 @@ struct ftrace_pid { struct pid *pid; }; +static bool ftrace_pids_enabled(void) +{ + return !list_empty(&ftrace_pids); +} + +static void ftrace_update_trampoline(struct ftrace_ops *ops); + /* * ftrace_disabled is set when an anomaly is discovered. * ftrace_disabled is much stronger than ftrace_enabled. @@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; static struct ftrace_ops control_ops; @@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, if (!test_tsk_trace_trace(current)) return; - ftrace_pid_function(ip, parent_ip, op, regs); -} - -static void set_ftrace_pid_function(ftrace_func_t func) -{ - /* do not set ftrace_pid_function to itself! */ - if (func != ftrace_pid_func) - ftrace_pid_function = func; + op->saved_func(ip, parent_ip, op, regs); } /** @@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func) void clear_ftrace_function(void) { ftrace_trace_function = ftrace_stub; - ftrace_pid_function = ftrace_stub; } static void control_ops_disable_all(struct ftrace_ops *ops) @@ -436,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) } else add_ftrace_ops(&ftrace_ops_list, ops); + /* Always save the function, and reset at unregistering */ + ops->saved_func = ops->func; + + if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled()) + ops->func = ftrace_pid_func; + ftrace_update_trampoline(ops); if (ftrace_enabled) @@ -463,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (ftrace_enabled) update_ftrace_function(); + ops->func = ops->saved_func; + return 0; } static void ftrace_update_pid_func(void) { + bool enabled = ftrace_pids_enabled(); + struct ftrace_ops *op; + /* Only do something if we are tracing something */ if (ftrace_trace_function == ftrace_stub) return; + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->flags & FTRACE_OPS_FL_PID) { + op->func = enabled ? ftrace_pid_func : + op->saved_func; + ftrace_update_trampoline(op); + } + } while_for_each_ftrace_op(op); + update_ftrace_function(); } @@ -1133,7 +1150,8 @@ static struct ftrace_ops global_ops = { .local_hash.filter_hash = EMPTY_HASH, INIT_OPS_HASH(global_ops) .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED, + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID, }; /* @@ -5023,7 +5041,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops) static struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | + FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID, }; static int __init ftrace_nodyn_init(void) @@ -5080,11 +5100,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) if (WARN_ON(tr->ops->func != ftrace_stub)) printk("ftrace ops had %pS for function\n", tr->ops->func); - /* Only the top level instance does pid tracing */ - if (!list_empty(&ftrace_pids)) { - set_ftrace_pid_function(func); - func = ftrace_pid_func; - } } tr->ops->func = func; tr->ops->private = tr; @@ -5371,7 +5386,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos) { mutex_lock(&ftrace_lock); - if (list_empty(&ftrace_pids) && (!*pos)) + if (!ftrace_pids_enabled() && (!*pos)) return (void *) 1; return seq_list_start(&ftrace_pids, *pos); @@ -5610,6 +5625,7 @@ static struct ftrace_ops graph_ops = { .func = ftrace_stub, .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED | + FTRACE_OPS_FL_PID | FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR .trampoline = FTRACE_GRAPH_TRAMP_ADDR, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f060716b02ae..74bde81601a9 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,7 @@ enum { TRACE_CONTROL_BIT, + TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index a87b43f49eb4..e2e12ad3186f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -36,9 +36,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_branch *entry; struct ring_buffer *buffer; unsigned long flags; - int cpu, pc; + int pc; const char *p; + if (current->trace_recursion & TRACE_BRANCH_BIT) + return; + /* * I would love to save just the ftrace_likely_data pointer, but * this code can also be used by modules. Ugly things can happen @@ -49,10 +52,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); - if (atomic_inc_return(&data->disabled) != 1) + raw_local_irq_save(flags); + current->trace_recursion |= TRACE_BRANCH_BIT; + data = this_cpu_ptr(tr->trace_buffer.data); + if (atomic_read(&data->disabled)) goto out; pc = preempt_count(); @@ -81,8 +84,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) __buffer_unlock_commit(buffer, event); out: - atomic_dec(&data->disabled); - local_irq_restore(flags); + current->trace_recursion &= ~TRACE_BRANCH_BIT; + raw_local_irq_restore(flags); } static inline diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index aa1ea7b36fa8..d2f6d0be3503 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v) seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, trace_event_name(&tu->tp.call)); - seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); + seq_printf(m, " %s:", tu->filename); + + /* Don't print "0x (null)" when offset is 0 */ + if (tu->offset) { + seq_printf(m, "0x%p", (void *)tu->offset); + } else { + switch (sizeof(void *)) { + case 4: + seq_printf(m, "0x00000000"); + break; + case 8: + default: + seq_printf(m, "0x0000000000000000"); + break; + } + } for (i = 0; i < tu->tp.nr_args; i++) seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); @@ -1095,11 +1110,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, { struct trace_event_call *call = &tu->tp.call; struct uprobe_trace_entry_head *entry; + struct bpf_prog *prog = call->prog; struct hlist_head *head; void *data; int size, esize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); size = esize + tu->tp.size + dsize; @@ -1289,6 +1308,7 @@ static int register_uprobe_event(struct trace_uprobe *tu) return -ENODEV; } + call->flags = TRACE_EVENT_FL_UPROBE; call->class->reg = trace_uprobe_register; call->data = tu; ret = trace_add_event_call(call); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f5782d5fd196..811edb77dd6d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex), \ - "sched RCU or wq->mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex), \ + "sched RCU or wq->mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - rcu_lockdep_assert(rcu_read_lock_sched_held() || \ - lockdep_is_held(&wq->mutex) || \ - lockdep_is_held(&wq_pool_mutex), \ - "sched RCU, wq->mutex or wq_pool_mutex should be held") + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ + !lockdep_is_held(&wq->mutex) && \ + !lockdep_is_held(&wq_pool_mutex), \ + "sched RCU, wq->mutex or wq_pool_mutex should be held") #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ |