summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/bpf/arraymap.c2
-rw-r--r--kernel/events/core.c492
-rw-r--r--kernel/events/ring_buffer.c2
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/irq/timings.c2
-rw-r--r--kernel/irq_work.c11
-rw-r--r--kernel/jump_label.c14
-rw-r--r--kernel/kallsyms.c43
-rw-r--r--kernel/kprobes.c18
-rw-r--r--kernel/locking/lockdep.c23
-rw-r--r--kernel/locking/qrwlock.c86
-rw-r--r--kernel/locking/qspinlock_paravirt.h47
-rw-r--r--kernel/locking/rwsem.c16
-rw-r--r--kernel/locking/spinlock.c9
-rw-r--r--kernel/module.c32
-rw-r--r--kernel/rcu/rcu.h21
-rw-r--r--kernel/rcu/rcu_segcblist.c1
-rw-r--r--kernel/rcu/rcutorture.c24
-rw-r--r--kernel/rcu/tree.c175
-rw-r--r--kernel/rcu/tree.h5
-rw-r--r--kernel/rcu/tree_plugin.h24
-rw-r--r--kernel/rcu/update.c25
-rw-r--r--kernel/sched/clock.c2
-rw-r--r--kernel/sched/core.c5
-rw-r--r--kernel/sched/cpufreq_schedutil.c6
-rw-r--r--kernel/sched/cputime.c3
-rw-r--r--kernel/seccomp.c2
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/softirq.c10
-rw-r--r--kernel/task_work.c2
-rw-r--r--kernel/test_kprobes.c29
-rw-r--r--kernel/time/hrtimer.c4
-rw-r--r--kernel/time/posix-cpu-timers.c6
-rw-r--r--kernel/time/tick-sched.c5
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_stack.c2
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/workqueue.c25
41 files changed, 663 insertions, 526 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 6670fbd3e466..d15c0ee4d955 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -147,7 +147,7 @@ static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
again:
smp_rmb();
rcu_read_lock();
- res = to_acct(ACCESS_ONCE(ns->bacct));
+ res = to_acct(READ_ONCE(ns->bacct));
if (!res) {
rcu_read_unlock();
return NULL;
@@ -159,7 +159,7 @@ again:
}
rcu_read_unlock();
mutex_lock(&res->lock);
- if (res != to_acct(ACCESS_ONCE(ns->bacct))) {
+ if (res != to_acct(READ_ONCE(ns->bacct))) {
mutex_unlock(&res->lock);
acct_put(res);
goto again;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index e2636737b69b..c4b9ab01bba5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -492,7 +492,7 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
+ if (perf_event_read_local(event, &value, NULL, NULL) == -EOPNOTSUPP)
goto err_out;
ee = bpf_event_entry_gen(perf_file, map_file);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 10cdb9c26b5d..4c39c05e029a 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -209,7 +209,7 @@ static int event_function(void *info)
struct perf_event_context *task_ctx = cpuctx->task_ctx;
int ret = 0;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
perf_ctx_lock(cpuctx, task_ctx);
/*
@@ -306,7 +306,7 @@ static void event_function_local(struct perf_event *event, event_f func, void *d
struct task_struct *task = READ_ONCE(ctx->task);
struct perf_event_context *task_ctx = NULL;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (task) {
if (task == TASK_TOMBSTONE)
@@ -582,6 +582,88 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
+/*
+ * State based event timekeeping...
+ *
+ * The basic idea is to use event->state to determine which (if any) time
+ * fields to increment with the current delta. This means we only need to
+ * update timestamps when we change state or when they are explicitly requested
+ * (read).
+ *
+ * Event groups make things a little more complicated, but not terribly so. The
+ * rules for a group are that if the group leader is OFF the entire group is
+ * OFF, irrespecive of what the group member states are. This results in
+ * __perf_effective_state().
+ *
+ * A futher ramification is that when a group leader flips between OFF and
+ * !OFF, we need to update all group member times.
+ *
+ *
+ * NOTE: perf_event_time() is based on the (cgroup) context time, and thus we
+ * need to make sure the relevant context time is updated before we try and
+ * update our timestamps.
+ */
+
+static __always_inline enum perf_event_state
+__perf_effective_state(struct perf_event *event)
+{
+ struct perf_event *leader = event->group_leader;
+
+ if (leader->state <= PERF_EVENT_STATE_OFF)
+ return leader->state;
+
+ return event->state;
+}
+
+static __always_inline void
+__perf_update_times(struct perf_event *event, u64 now, u64 *enabled, u64 *running)
+{
+ enum perf_event_state state = __perf_effective_state(event);
+ u64 delta = now - event->tstamp;
+
+ *enabled = event->total_time_enabled;
+ if (state >= PERF_EVENT_STATE_INACTIVE)
+ *enabled += delta;
+
+ *running = event->total_time_running;
+ if (state >= PERF_EVENT_STATE_ACTIVE)
+ *running += delta;
+}
+
+static void perf_event_update_time(struct perf_event *event)
+{
+ u64 now = perf_event_time(event);
+
+ __perf_update_times(event, now, &event->total_time_enabled,
+ &event->total_time_running);
+ event->tstamp = now;
+}
+
+static void perf_event_update_sibling_time(struct perf_event *leader)
+{
+ struct perf_event *sibling;
+
+ list_for_each_entry(sibling, &leader->sibling_list, group_entry)
+ perf_event_update_time(sibling);
+}
+
+static void
+perf_event_set_state(struct perf_event *event, enum perf_event_state state)
+{
+ if (event->state == state)
+ return;
+
+ perf_event_update_time(event);
+ /*
+ * If a group leader gets enabled/disabled all its siblings
+ * are affected too.
+ */
+ if ((event->state < 0) ^ (state < 0))
+ perf_event_update_sibling_time(event);
+
+ WRITE_ONCE(event->state, state);
+}
+
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -841,40 +923,6 @@ perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
event->shadow_ctx_time = now - t->timestamp;
}
-static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
- /*
- * when the current task's perf cgroup does not match
- * the event's, we need to remember to call the
- * perf_mark_enable() function the first time a task with
- * a matching perf cgroup is scheduled in.
- */
- if (is_cgroup_event(event) && !perf_cgroup_match(event))
- event->cgrp_defer_enabled = 1;
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- if (!event->cgrp_defer_enabled)
- return;
-
- event->cgrp_defer_enabled = 0;
-
- event->tstamp_enabled = tstamp - event->total_time_enabled;
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
- sub->cgrp_defer_enabled = 0;
- }
- }
-}
-
/*
* Update cpuctx->cgrp so that it is set when first cgroup event is added and
* cleared when last cgroup event is removed.
@@ -975,17 +1023,6 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event)
}
static inline void
-perf_cgroup_defer_enabled(struct perf_event *event)
-{
-}
-
-static inline void
-perf_cgroup_mark_enabled(struct perf_event *event,
- struct perf_event_context *ctx)
-{
-}
-
-static inline void
list_update_cgroup_event(struct perf_event *event,
struct perf_event_context *ctx, bool add)
{
@@ -1006,7 +1043,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
struct perf_cpu_context *cpuctx;
int rotations = 0;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
rotations = perf_rotate_context(cpuctx);
@@ -1093,7 +1130,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(!list_empty(&ctx->active_ctx_list));
@@ -1102,7 +1139,7 @@ static void perf_event_ctx_activate(struct perf_event_context *ctx)
static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
{
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
WARN_ON(list_empty(&ctx->active_ctx_list));
@@ -1202,7 +1239,7 @@ perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
again:
rcu_read_lock();
- ctx = ACCESS_ONCE(event->ctx);
+ ctx = READ_ONCE(event->ctx);
if (!atomic_inc_not_zero(&ctx->refcount)) {
rcu_read_unlock();
goto again;
@@ -1398,60 +1435,6 @@ static u64 perf_event_time(struct perf_event *event)
return ctx ? ctx->time : 0;
}
-/*
- * Update the total_time_enabled and total_time_running fields for a event.
- */
-static void update_event_times(struct perf_event *event)
-{
- struct perf_event_context *ctx = event->ctx;
- u64 run_end;
-
- lockdep_assert_held(&ctx->lock);
-
- if (event->state < PERF_EVENT_STATE_INACTIVE ||
- event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
- return;
-
- /*
- * in cgroup mode, time_enabled represents
- * the time the event was enabled AND active
- * tasks were in the monitored cgroup. This is
- * independent of the activity of the context as
- * there may be a mix of cgroup and non-cgroup events.
- *
- * That is why we treat cgroup events differently
- * here.
- */
- if (is_cgroup_event(event))
- run_end = perf_cgroup_event_time(event);
- else if (ctx->is_active)
- run_end = ctx->time;
- else
- run_end = event->tstamp_stopped;
-
- event->total_time_enabled = run_end - event->tstamp_enabled;
-
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- run_end = event->tstamp_stopped;
- else
- run_end = perf_event_time(event);
-
- event->total_time_running = run_end - event->tstamp_running;
-
-}
-
-/*
- * Update total_time_enabled and total_time_running for all events in a group.
- */
-static void update_group_times(struct perf_event *leader)
-{
- struct perf_event *event;
-
- update_event_times(leader);
- list_for_each_entry(event, &leader->sibling_list, group_entry)
- update_event_times(event);
-}
-
static enum event_type_t get_event_type(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
@@ -1494,6 +1477,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
+ event->tstamp = perf_event_time(event);
+
/*
* If we're a stand alone event or group leader, we go to the context
* list, group events are kept attached to the group so that
@@ -1701,8 +1686,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (event->group_leader == event)
list_del_init(&event->group_entry);
- update_group_times(event);
-
/*
* If event was in error state, then keep it
* that way, otherwise bogus counts will be
@@ -1711,7 +1694,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
* of the event
*/
if (event->state > PERF_EVENT_STATE_OFF)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
ctx->generation++;
}
@@ -1810,38 +1793,24 @@ event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
- u64 delta;
+ enum perf_event_state state = PERF_EVENT_STATE_INACTIVE;
WARN_ON_ONCE(event->ctx != ctx);
lockdep_assert_held(&ctx->lock);
- /*
- * An event which could not be activated because of
- * filter mismatch still needs to have its timings
- * maintained, otherwise bogus information is return
- * via read() for time_enabled, time_running:
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE &&
- !event_filter_match(event)) {
- delta = tstamp - event->tstamp_stopped;
- event->tstamp_running += delta;
- event->tstamp_stopped = tstamp;
- }
-
if (event->state != PERF_EVENT_STATE_ACTIVE)
return;
perf_pmu_disable(event->pmu);
- event->tstamp_stopped = tstamp;
event->pmu->del(event, 0);
event->oncpu = -1;
- event->state = PERF_EVENT_STATE_INACTIVE;
+
if (event->pending_disable) {
event->pending_disable = 0;
- event->state = PERF_EVENT_STATE_OFF;
+ state = PERF_EVENT_STATE_OFF;
}
+ perf_event_set_state(event, state);
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -1861,7 +1830,9 @@ group_sched_out(struct perf_event *group_event,
struct perf_event_context *ctx)
{
struct perf_event *event;
- int state = group_event->state;
+
+ if (group_event->state != PERF_EVENT_STATE_ACTIVE)
+ return;
perf_pmu_disable(ctx->pmu);
@@ -1875,7 +1846,7 @@ group_sched_out(struct perf_event *group_event,
perf_pmu_enable(ctx->pmu);
- if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
+ if (group_event->attr.exclusive)
cpuctx->exclusive = 0;
}
@@ -1895,6 +1866,11 @@ __perf_remove_from_context(struct perf_event *event,
{
unsigned long flags = (unsigned long)info;
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
event_sched_out(event, cpuctx, ctx);
if (flags & DETACH_GROUP)
perf_group_detach(event);
@@ -1957,14 +1933,17 @@ static void __perf_event_disable(struct perf_event *event,
if (event->state < PERF_EVENT_STATE_INACTIVE)
return;
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
+ if (ctx->is_active & EVENT_TIME) {
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
+
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
else
event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
+
+ perf_event_set_state(event, PERF_EVENT_STATE_OFF);
}
/*
@@ -2021,8 +2000,7 @@ void perf_event_disable_inatomic(struct perf_event *event)
}
static void perf_set_shadow_time(struct perf_event *event,
- struct perf_event_context *ctx,
- u64 tstamp)
+ struct perf_event_context *ctx)
{
/*
* use the correct time source for the time snapshot
@@ -2050,9 +2028,9 @@ static void perf_set_shadow_time(struct perf_event *event,
* is cleaner and simpler to understand.
*/
if (is_cgroup_event(event))
- perf_cgroup_set_shadow_time(event, tstamp);
+ perf_cgroup_set_shadow_time(event, event->tstamp);
else
- event->shadow_ctx_time = tstamp - ctx->timestamp;
+ event->shadow_ctx_time = event->tstamp - ctx->timestamp;
}
#define MAX_INTERRUPTS (~0ULL)
@@ -2065,7 +2043,6 @@ event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
int ret = 0;
lockdep_assert_held(&ctx->lock);
@@ -2075,11 +2052,12 @@ event_sched_in(struct perf_event *event,
WRITE_ONCE(event->oncpu, smp_processor_id());
/*
- * Order event::oncpu write to happen before the ACTIVE state
- * is visible.
+ * Order event::oncpu write to happen before the ACTIVE state is
+ * visible. This allows perf_event_{stop,read}() to observe the correct
+ * ->oncpu if it sees ACTIVE.
*/
smp_wmb();
- WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
+ perf_event_set_state(event, PERF_EVENT_STATE_ACTIVE);
/*
* Unthrottle events, since we scheduled we might have missed several
@@ -2091,26 +2069,19 @@ event_sched_in(struct perf_event *event,
event->hw.interrupts = 0;
}
- /*
- * The new state must be visible before we turn it on in the hardware:
- */
- smp_wmb();
-
perf_pmu_disable(event->pmu);
- perf_set_shadow_time(event, ctx, tstamp);
+ perf_set_shadow_time(event, ctx);
perf_log_itrace_start(event);
if (event->pmu->add(event, PERF_EF_START)) {
- event->state = PERF_EVENT_STATE_INACTIVE;
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
event->oncpu = -1;
ret = -EAGAIN;
goto out;
}
- event->tstamp_running += tstamp - event->tstamp_stopped;
-
if (!is_software_event(event))
cpuctx->active_oncpu++;
if (!ctx->nr_active++)
@@ -2134,8 +2105,6 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = ctx->pmu;
- u64 now = ctx->time;
- bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
@@ -2165,27 +2134,13 @@ group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
- * The events up to the failed event are scheduled out normally,
- * tstamp_stopped will be updated.
- *
- * The failed events and the remaining siblings need to have
- * their timings updated as if they had gone thru event_sched_in()
- * and event_sched_out(). This is required to get consistent timings
- * across the group. This also takes care of the case where the group
- * could never be scheduled by ensuring tstamp_stopped is set to mark
- * the time the event was actually stopped, such that time delta
- * calculation in update_event_times() is correct.
+ * The events up to the failed event are scheduled out normally.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
- simulate = true;
+ break;
- if (simulate) {
- event->tstamp_running += now - event->tstamp_stopped;
- event->tstamp_stopped = now;
- } else {
- event_sched_out(event, cpuctx, ctx);
- }
+ event_sched_out(event, cpuctx, ctx);
}
event_sched_out(group_event, cpuctx, ctx);
@@ -2227,46 +2182,11 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
-/*
- * Complement to update_event_times(). This computes the tstamp_* values to
- * continue 'enabled' state from @now, and effectively discards the time
- * between the prior tstamp_stopped and now (as we were in the OFF state, or
- * just switched (context) time base).
- *
- * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
- * cannot have been scheduled in yet. And going into INACTIVE state means
- * '@event->tstamp_stopped = @now'.
- *
- * Thus given the rules of update_event_times():
- *
- * total_time_enabled = tstamp_stopped - tstamp_enabled
- * total_time_running = tstamp_stopped - tstamp_running
- *
- * We can insert 'tstamp_stopped == now' and reverse them to compute new
- * tstamp_* values.
- */
-static void __perf_event_enable_time(struct perf_event *event, u64 now)
-{
- WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
-
- event->tstamp_stopped = now;
- event->tstamp_enabled = now - event->total_time_enabled;
- event->tstamp_running = now - event->total_time_running;
-}
-
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
- u64 tstamp = perf_event_time(event);
-
list_add_event(event, ctx);
perf_group_attach(event);
- /*
- * We can be called with event->state == STATE_OFF when we create with
- * .disabled = 1. In that case the IOC_ENABLE will call this function.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2498,28 +2418,6 @@ again:
}
/*
- * Put a event into inactive state and update time fields.
- * Enabling the leader of a group effectively enables all
- * the group members that aren't explicitly disabled, so we
- * have to update their ->tstamp_enabled also.
- * Note: this works for group members as well as group leaders
- * since the non-leader members' sibling_lists will be empty.
- */
-static void __perf_event_mark_enabled(struct perf_event *event)
-{
- struct perf_event *sub;
- u64 tstamp = perf_event_time(event);
-
- event->state = PERF_EVENT_STATE_INACTIVE;
- __perf_event_enable_time(event, tstamp);
- list_for_each_entry(sub, &event->sibling_list, group_entry) {
- /* XXX should not be > INACTIVE if event isn't */
- if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- __perf_event_enable_time(sub, tstamp);
- }
-}
-
-/*
* Cross CPU call to enable a performance event
*/
static void __perf_event_enable(struct perf_event *event,
@@ -2537,14 +2435,12 @@ static void __perf_event_enable(struct perf_event *event,
if (ctx->is_active)
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
if (!ctx->is_active)
return;
if (!event_filter_match(event)) {
- if (is_cgroup_event(event))
- perf_cgroup_defer_enabled(event);
ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
return;
}
@@ -2864,18 +2760,10 @@ static void __perf_event_sync_stat(struct perf_event *event,
* we know the event must be on the current CPU, therefore we
* don't need to use it.
*/
- switch (event->state) {
- case PERF_EVENT_STATE_ACTIVE:
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
event->pmu->read(event);
- /* fall-through */
-
- case PERF_EVENT_STATE_INACTIVE:
- update_event_times(event);
- break;
- default:
- break;
- }
+ perf_event_update_time(event);
/*
* In order to keep per-task stats reliable we need to flip the event
@@ -3112,10 +3000,6 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -3123,10 +3007,8 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
* If this pinned group hasn't been scheduled,
* put it in error state.
*/
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_ERROR;
- }
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
}
}
@@ -3148,10 +3030,6 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
- /* may need to reset tstamp_enabled */
- if (is_cgroup_event(event))
- perf_cgroup_mark_enabled(event, ctx);
-
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -3523,7 +3401,7 @@ void perf_event_task_tick(void)
struct perf_event_context *ctx, *tmp;
int throttled;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
@@ -3543,7 +3421,7 @@ static int event_enable_on_exec(struct perf_event *event,
if (event->state >= PERF_EVENT_STATE_INACTIVE)
return 0;
- __perf_event_mark_enabled(event);
+ perf_event_set_state(event, PERF_EVENT_STATE_INACTIVE);
return 1;
}
@@ -3637,12 +3515,15 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
- update_event_times(event);
+ perf_event_update_time(event);
+ if (data->group)
+ perf_event_update_sibling_time(event);
+
if (event->state != PERF_EVENT_STATE_ACTIVE)
goto unlock;
@@ -3657,7 +3538,6 @@ static void __perf_event_read(void *info)
pmu->read(event);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
- update_event_times(sub);
if (sub->state == PERF_EVENT_STATE_ACTIVE) {
/*
* Use sibling's PMU rather than @event's since
@@ -3686,7 +3566,8 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-int perf_event_read_local(struct perf_event *event, u64 *value)
+int perf_event_read_local(struct perf_event *event, u64 *value,
+ u64 *enabled, u64 *running)
{
unsigned long flags;
int ret = 0;
@@ -3720,6 +3601,7 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
goto out;
}
+
/*
* If the event is currently on this CPU, its either a per-task event,
* or local to this CPU. Furthermore it means its ACTIVE (otherwise
@@ -3729,6 +3611,16 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
event->pmu->read(event);
*value = local64_read(&event->count);
+ if (enabled || running) {
+ u64 now = event->shadow_ctx_time + perf_clock();
+ u64 __enabled, __running;
+
+ __perf_update_times(event, now, &__enabled, &__running);
+ if (enabled)
+ *enabled = __enabled;
+ if (running)
+ *running = __running;
+ }
out:
local_irq_restore(flags);
@@ -3737,23 +3629,35 @@ out:
static int perf_event_read(struct perf_event *event, bool group)
{
+ enum perf_event_state state = READ_ONCE(event->state);
int event_cpu, ret = 0;
/*
* If event is enabled and currently active on a CPU, update the
* value in the event structure:
*/
- if (event->state == PERF_EVENT_STATE_ACTIVE) {
- struct perf_read_data data = {
- .event = event,
- .group = group,
- .ret = 0,
- };
+again:
+ if (state == PERF_EVENT_STATE_ACTIVE) {
+ struct perf_read_data data;
+
+ /*
+ * Orders the ->state and ->oncpu loads such that if we see
+ * ACTIVE we must also see the right ->oncpu.
+ *
+ * Matches the smp_wmb() from event_sched_in().
+ */
+ smp_rmb();
event_cpu = READ_ONCE(event->oncpu);
if ((unsigned)event_cpu >= nr_cpu_ids)
return 0;
+ data = (struct perf_read_data){
+ .event = event,
+ .group = group,
+ .ret = 0,
+ };
+
preempt_disable();
event_cpu = __perf_event_read_cpu(event, event_cpu);
@@ -3770,24 +3674,30 @@ static int perf_event_read(struct perf_event *event, bool group)
(void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
preempt_enable();
ret = data.ret;
- } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+
+ } else if (state == PERF_EVENT_STATE_INACTIVE) {
struct perf_event_context *ctx = event->ctx;
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
+ state = event->state;
+ if (state != PERF_EVENT_STATE_INACTIVE) {
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
+ goto again;
+ }
+
/*
- * may read while context is not active
- * (e.g., thread is blocked), in that case
- * we cannot update context time
+ * May read while context is not active (e.g., thread is
+ * blocked), in that case we cannot update context time
*/
- if (ctx->is_active) {
+ if (ctx->is_active & EVENT_TIME) {
update_context_time(ctx);
update_cgrp_time_from_event(event);
}
+
+ perf_event_update_time(event);
if (group)
- update_group_times(event);
- else
- update_event_times(event);
+ perf_event_update_sibling_time(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -4233,7 +4143,7 @@ static void perf_remove_from_owner(struct perf_event *event)
* indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- owner = lockless_dereference(event->owner);
+ owner = READ_ONCE(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -4330,7 +4240,7 @@ again:
* Cannot change, child events are not migrated, see the
* comment with perf_event_ctx_lock_nested().
*/
- ctx = lockless_dereference(child->ctx);
+ ctx = READ_ONCE(child->ctx);
/*
* Since child_mutex nests inside ctx::mutex, we must jump
* through hoops. We start by grabbing a reference on the ctx.
@@ -4390,7 +4300,7 @@ static int perf_release(struct inode *inode, struct file *file)
return 0;
}
-u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+static u64 __perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
u64 total = 0;
@@ -4418,6 +4328,18 @@ u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
return total;
}
+
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+ struct perf_event_context *ctx;
+ u64 count;
+
+ ctx = perf_event_ctx_lock(event);
+ count = __perf_event_read_value(event, enabled, running);
+ perf_event_ctx_unlock(event, ctx);
+
+ return count;
+}
EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
@@ -4433,6 +4355,8 @@ static int __perf_read_group_add(struct perf_event *leader,
if (ret)
return ret;
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
/*
* Since we co-schedule groups, {enabled,running} times of siblings
* will be identical to those of the leader, so we only publish one
@@ -4455,8 +4379,6 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
- raw_spin_lock_irqsave(&ctx->lock, flags);
-
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
@@ -4520,7 +4442,7 @@ static int perf_read_one(struct perf_event *event,
u64 values[4];
int n = 0;
- values[n++] = perf_event_read_value(event, &enabled, &running);
+ values[n++] = __perf_event_read_value(event, &enabled, &running);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
@@ -4899,8 +4821,7 @@ static void calc_timer_values(struct perf_event *event,
*now = perf_clock();
ctx_time = event->shadow_ctx_time + *now;
- *enabled = ctx_time - event->tstamp_enabled;
- *running = ctx_time - event->tstamp_running;
+ __perf_update_times(event, ctx_time, enabled, running);
}
static void perf_event_init_userpage(struct perf_event *event)
@@ -5304,8 +5225,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
if (!rb)
goto aux_unlock;
- aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
- aux_size = ACCESS_ONCE(rb->user_page->aux_size);
+ aux_offset = READ_ONCE(rb->user_page->aux_offset);
+ aux_size = READ_ONCE(rb->user_page->aux_size);
if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
goto aux_unlock;
@@ -8074,6 +7995,7 @@ static void bpf_overflow_handler(struct perf_event *event,
struct bpf_perf_event_data_kern ctx = {
.data = data,
.regs = regs,
+ .event = event,
};
int ret = 0;
@@ -9404,6 +9326,11 @@ static void account_event(struct perf_event *event)
inc = true;
if (inc) {
+ /*
+ * We need the mutex here because static_branch_enable()
+ * must complete *before* the perf_sched_count increment
+ * becomes visible.
+ */
if (atomic_inc_not_zero(&perf_sched_count))
goto enabled;
@@ -10529,7 +10456,7 @@ perf_event_exit_event(struct perf_event *child_event,
if (parent_event)
perf_group_detach(child_event);
list_del_event(child_event, child_ctx);
- child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ perf_event_set_state(child_event, PERF_EVENT_STATE_EXIT); /* is_event_hup() */
raw_spin_unlock_irq(&child_ctx->lock);
/*
@@ -10767,7 +10694,7 @@ inherit_event(struct perf_event *parent_event,
struct perf_event *group_leader,
struct perf_event_context *child_ctx)
{
- enum perf_event_active_state parent_state = parent_event->state;
+ enum perf_event_state parent_state = parent_event->state;
struct perf_event *child_event;
unsigned long flags;
@@ -11103,6 +11030,7 @@ static void __perf_event_exit_context(void *__info)
struct perf_event *event;
raw_spin_lock(&ctx->lock);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
list_for_each_entry(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
raw_spin_unlock(&ctx->lock);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f684d8e5fa2b..f3e37971c842 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -381,7 +381,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
* (B) <-> (C) ordering is still observed by the pmu driver.
*/
if (!rb->aux_overwrite) {
- aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
+ aux_tail = READ_ONCE(rb->user_page->aux_tail);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..6b4298a41167 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1339,7 +1339,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
- int exit_state = ACCESS_ONCE(p->exit_state);
+ int exit_state = READ_ONCE(p->exit_state);
int ret;
if (unlikely(exit_state == EXIT_DEAD))
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c8c1d073fbf1..e0923fa4927a 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -264,7 +264,7 @@ u64 irq_timings_next_event(u64 now)
* order to prevent the timings circular buffer to be updated
* while we are reading it.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
/*
* Number of elements in the circular buffer: If it happens it
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index bcf107ce0854..ec8ac337404d 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -56,7 +56,6 @@ void __weak arch_irq_work_raise(void)
*/
}
-#ifdef CONFIG_SMP
/*
* Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
@@ -68,6 +67,8 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(cpu));
+#ifdef CONFIG_SMP
+
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
@@ -78,10 +79,12 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
arch_send_call_function_single_ipi(cpu);
+#else /* #ifdef CONFIG_SMP */
+ irq_work_queue(work);
+#endif /* #else #ifdef CONFIG_SMP */
+
return true;
}
-EXPORT_SYMBOL_GPL(irq_work_queue_on);
-#endif
/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
@@ -188,7 +191,7 @@ void irq_work_tick(void)
*/
void irq_work_sync(struct irq_work *work)
{
- WARN_ON_ONCE(irqs_disabled());
+ lockdep_assert_irqs_enabled();
while (work->flags & IRQ_WORK_BUSY)
cpu_relax();
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 0bf2e8f5244a..8ff4ca4665ff 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -83,7 +83,7 @@ static void static_key_slow_inc_cpuslocked(struct static_key *key)
{
int v, v1;
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
/*
* Careful if we get concurrent static_key_slow_inc() calls;
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);
void static_key_enable_cpuslocked(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
if (atomic_read(&key->enabled) > 0) {
WARN_ON_ONCE(atomic_read(&key->enabled) != 1);
@@ -158,7 +158,7 @@ EXPORT_SYMBOL_GPL(static_key_enable);
void static_key_disable_cpuslocked(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
if (atomic_read(&key->enabled) != 1) {
WARN_ON_ONCE(atomic_read(&key->enabled) != 0);
@@ -224,21 +224,21 @@ static void jump_label_update_timeout(struct work_struct *work)
void static_key_slow_dec(struct static_key *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(key, 0, NULL);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec);
void static_key_slow_dec_deferred(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
__static_key_slow_dec(&key->key, key->timeout, &key->work);
}
EXPORT_SYMBOL_GPL(static_key_slow_dec_deferred);
void static_key_deferred_flush(struct static_key_deferred *key)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
flush_delayed_work(&key->work);
}
EXPORT_SYMBOL_GPL(static_key_deferred_flush);
@@ -246,7 +246,7 @@ EXPORT_SYMBOL_GPL(static_key_deferred_flush);
void jump_label_rate_limit(struct static_key_deferred *key,
unsigned long rl)
{
- STATIC_KEY_CHECK_USE();
+ STATIC_KEY_CHECK_USE(key);
key->timeout = rl;
INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
}
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 127e7cfafa55..1e6ae66c6244 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -480,6 +480,7 @@ struct kallsym_iter {
char name[KSYM_NAME_LEN];
char module_name[MODULE_NAME_LEN];
int exported;
+ int show_value;
};
static int get_ksymbol_mod(struct kallsym_iter *iter)
@@ -582,12 +583,15 @@ static void s_stop(struct seq_file *m, void *p)
static int s_show(struct seq_file *m, void *p)
{
+ unsigned long value;
struct kallsym_iter *iter = m->private;
/* Some debugging symbols have no name. Ignore them. */
if (!iter->name[0])
return 0;
+ value = iter->show_value ? iter->value : 0;
+
if (iter->module_name[0]) {
char type;
@@ -597,10 +601,10 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
type, iter->name, iter->module_name);
} else
- seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ seq_printf(m, KALLSYM_FMT " %c %s\n", value,
iter->type, iter->name);
return 0;
}
@@ -612,6 +616,40 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
+static inline int kallsyms_for_perf(void)
+{
+#ifdef CONFIG_PERF_EVENTS
+ extern int sysctl_perf_event_paranoid;
+ if (sysctl_perf_event_paranoid <= 1)
+ return 1;
+#endif
+ return 0;
+}
+
+/*
+ * We show kallsyms information even to normal users if we've enabled
+ * kernel profiling and are explicitly not paranoid (so kptr_restrict
+ * is clear, and sysctl_perf_event_paranoid isn't set).
+ *
+ * Otherwise, require CAP_SYSLOG (assuming kptr_restrict isn't set to
+ * block even that).
+ */
+int kallsyms_show_value(void)
+{
+ switch (kptr_restrict) {
+ case 0:
+ if (kallsyms_for_perf())
+ return 1;
+ /* fallthrough */
+ case 1:
+ if (has_capability_noaudit(current, CAP_SYSLOG))
+ return 1;
+ /* fallthrough */
+ default:
+ return 0;
+ }
+}
+
static int kallsyms_open(struct inode *inode, struct file *file)
{
/*
@@ -625,6 +663,7 @@ static int kallsyms_open(struct inode *inode, struct file *file)
return -ENOMEM;
reset_iter(iter, 0);
+ iter->show_value = kallsyms_show_value();
return 0;
}
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a1606a4224e1..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -117,7 +117,7 @@ enum kprobe_slot_state {
SLOT_USED = 2,
};
-static void *alloc_insn_page(void)
+void __weak *alloc_insn_page(void)
{
return module_alloc(PAGE_SIZE);
}
@@ -573,13 +573,15 @@ static void kprobe_optimizer(struct work_struct *work)
do_unoptimize_kprobes();
/*
- * Step 2: Wait for quiesence period to ensure all running interrupts
- * are done. Because optprobe may modify multiple instructions
- * there is a chance that Nth instruction is interrupted. In that
- * case, running interrupt can return to 2nd-Nth byte of jump
- * instruction. This wait is for avoiding it.
+ * Step 2: Wait for quiesence period to ensure all potentially
+ * preempted tasks to have normally scheduled. Because optprobe
+ * may modify multiple instructions, there is a chance that Nth
+ * instruction is preempted. In that case, such tasks can return
+ * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+ * Note that on non-preemptive kernel, this is transparently converted
+ * to synchronoze_sched() to wait for all interrupts to have completed.
*/
- synchronize_sched();
+ synchronize_rcu_tasks();
/* Step 3: Optimize kprobes after quiesence period */
do_optimize_kprobes();
@@ -1769,6 +1771,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
+#if 0
int register_jprobes(struct jprobe **jps, int num)
{
int ret = 0, i;
@@ -1837,6 +1840,7 @@ void unregister_jprobes(struct jprobe **jps, int num)
}
}
EXPORT_SYMBOL_GPL(unregister_jprobes);
+#endif
#ifdef CONFIG_KRETPROBES
/*
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index e36e652d996f..db933d063bfc 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -76,6 +76,19 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK
+static int crossrelease_fullstack = 1;
+#else
+static int crossrelease_fullstack;
+#endif
+static int __init allow_crossrelease_fullstack(char *str)
+{
+ crossrelease_fullstack = 1;
+ return 0;
+}
+
+early_param("crossrelease_fullstack", allow_crossrelease_fullstack);
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -4863,8 +4876,14 @@ static void add_xhlock(struct held_lock *hlock)
xhlock->trace.nr_entries = 0;
xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES;
xhlock->trace.entries = xhlock->trace_entries;
- xhlock->trace.skip = 3;
- save_stack_trace(&xhlock->trace);
+
+ if (crossrelease_fullstack) {
+ xhlock->trace.skip = 3;
+ save_stack_trace(&xhlock->trace);
+ } else {
+ xhlock->trace.nr_entries = 1;
+ xhlock->trace.entries[0] = hlock->acquire_ip;
+ }
}
static inline int same_context_xhlock(struct hist_lock *xhlock)
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 2655f26ec882..c7471c3fb798 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,49 +23,11 @@
#include <linux/spinlock.h>
#include <asm/qrwlock.h>
-/*
- * This internal data structure is used for optimizing access to some of
- * the subfields within the atomic_t cnts.
- */
-struct __qrwlock {
- union {
- atomic_t cnts;
- struct {
-#ifdef __LITTLE_ENDIAN
- u8 wmode; /* Writer mode */
- u8 rcnts[3]; /* Reader counts */
-#else
- u8 rcnts[3]; /* Reader counts */
- u8 wmode; /* Writer mode */
-#endif
- };
- };
- arch_spinlock_t lock;
-};
-
-/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
- while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax();
- cnts = atomic_read_acquire(&lock->cnts);
- }
-}
-
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
- * @cnts: Current qrwlock lock value
*/
-void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
+void queued_read_lock_slowpath(struct qrwlock *lock)
{
/*
* Readers come here when they cannot get the lock without waiting
@@ -73,13 +35,11 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
if (unlikely(in_interrupt())) {
/*
* Readers in interrupt context will get the lock immediately
- * if the writer is just waiting (not holding the lock yet).
- * The rspin_until_writer_unlock() function returns immediately
- * in this case. Otherwise, they will spin (with ACQUIRE
- * semantics) until the lock is available without waiting in
- * the queue.
+ * if the writer is just waiting (not holding the lock yet),
+ * so spin with ACQUIRE semantics until the lock is available
+ * without waiting in the queue.
*/
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
@@ -88,14 +48,14 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* Put the reader into the wait queue
*/
arch_spin_lock(&lock->wait_lock);
+ atomic_add(_QR_BIAS, &lock->cnts);
/*
* The ACQUIRE semantics of the following spinning code ensure
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
*/
- cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts);
- rspin_until_writer_unlock(lock, cnts);
+ atomic_cond_read_acquire(&lock->cnts, !(VAL & _QW_LOCKED));
/*
* Signal the next one in queue to become queue head
@@ -110,8 +70,6 @@ EXPORT_SYMBOL(queued_read_lock_slowpath);
*/
void queued_write_lock_slowpath(struct qrwlock *lock)
{
- u32 cnts;
-
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
@@ -120,30 +78,14 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
(atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
- /*
- * Set the waiting flag to notify readers that a writer is pending,
- * or wait for a previous writer to go away.
- */
- for (;;) {
- struct __qrwlock *l = (struct __qrwlock *)lock;
-
- if (!READ_ONCE(l->wmode) &&
- (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
- break;
+ /* Set the waiting flag to notify readers that a writer is pending */
+ atomic_add(_QW_WAITING, &lock->cnts);
- cpu_relax();
- }
-
- /* When no more readers, set the locked flag */
- for (;;) {
- cnts = atomic_read(&lock->cnts);
- if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
- break;
-
- cpu_relax();
- }
+ /* When no more readers or writers, set the locked flag */
+ do {
+ atomic_cond_read_acquire(&lock->cnts, VAL == _QW_WAITING);
+ } while (atomic_cmpxchg_relaxed(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) != _QW_WAITING);
unlock:
arch_spin_unlock(&lock->wait_lock);
}
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 15b6a39366c6..6ee477765e6c 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -61,21 +61,50 @@ struct pv_node {
#include "qspinlock_stat.h"
/*
+ * Hybrid PV queued/unfair lock
+ *
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enter the PV slowpath before
- * being queued. By allowing one lock stealing attempt here when the pending
- * bit is off, it helps to reduce the performance impact of lock waiter
- * preemption without the drawback of lock starvation.
+ * being queued.
+ *
+ * The pending bit is set by the queue head vCPU of the MCS wait queue in
+ * pv_wait_head_or_lock() to signal that it is ready to spin on the lock.
+ * When that bit becomes visible to the incoming waiters, no lock stealing
+ * is allowed. The function will return immediately to make the waiters
+ * enter the MCS wait queue. So lock starvation shouldn't happen as long
+ * as the queued mode vCPUs are actively running to set the pending bit
+ * and hence disabling lock stealing.
+ *
+ * When the pending bit isn't set, the lock waiters will stay in the unfair
+ * mode spinning on the lock unless the MCS wait queue is empty. In this
+ * case, the lock waiters will enter the queued mode slowpath trying to
+ * become the queue head and set the pending bit.
+ *
+ * This hybrid PV queued/unfair lock combines the best attributes of a
+ * queued lock (no lock starvation) and an unfair lock (good performance
+ * on not heavily contended locks).
*/
-#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
-static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+#define queued_spin_trylock(l) pv_hybrid_queued_unfair_trylock(l)
+static inline bool pv_hybrid_queued_unfair_trylock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
- if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
- (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
- qstat_inc(qstat_pv_lock_stealing, true);
- return true;
+ /*
+ * Stay in unfair lock mode as long as queued mode waiters are
+ * present in the MCS wait queue but the pending bit isn't set.
+ */
+ for (;;) {
+ int val = atomic_read(&lock->val);
+
+ if (!(val & _Q_LOCKED_PENDING_MASK) &&
+ (cmpxchg_acquire(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
+ qstat_inc(qstat_pv_lock_stealing, true);
+ return true;
+ }
+ if (!(val & _Q_TAIL_MASK) || (val & _Q_PENDING_MASK))
+ break;
+
+ cpu_relax();
}
return false;
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index a6c76a4832b4..f549c552dbf1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -29,6 +29,22 @@ void __sched down_read(struct rw_semaphore *sem)
EXPORT_SYMBOL(down_read);
+int __sched down_read_killable(struct rw_semaphore *sem)
+{
+ might_sleep();
+ rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+ if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) {
+ rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ return -EINTR;
+ }
+
+ rwsem_set_reader_owned(sem);
+ return 0;
+}
+
+EXPORT_SYMBOL(down_read_killable);
+
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
index 6e40fdfba326..1fd1a7543cdd 100644
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -30,11 +30,10 @@
#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
/*
* The __lock_function inlines are taken from
- * include/linux/spinlock_api_smp.h
+ * spinlock : include/linux/spinlock_api_smp.h
+ * rwlock : include/linux/rwlock_api_smp.h
*/
#else
-#define raw_read_can_lock(l) read_can_lock(l)
-#define raw_write_can_lock(l) write_can_lock(l)
/*
* Some architectures can relax in favour of the CPU owning the lock.
@@ -69,7 +68,7 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
@@ -89,7 +88,7 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
\
if (!(lock)->break_lock) \
(lock)->break_lock = 1; \
- while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
+ while ((lock)->break_lock) \
arch_##op##_relax(&lock->raw_lock); \
} \
(lock)->break_lock = 0; \
diff --git a/kernel/module.c b/kernel/module.c
index de66ec825992..32c2cdaccd93 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -278,6 +278,16 @@ static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE);
module_param(sig_enforce, bool_enable_only, 0644);
#endif /* !CONFIG_MODULE_SIG_FORCE */
+/*
+ * Export sig_enforce kernel cmdline parameter to allow other subsystems rely
+ * on that instead of directly to CONFIG_MODULE_SIG_FORCE config.
+ */
+bool is_module_sig_enforced(void)
+{
+ return sig_enforce;
+}
+EXPORT_SYMBOL(is_module_sig_enforced);
+
/* Block module loading/unloading? */
int modules_disabled = 0;
core_param(nomodule, modules_disabled, bint, 0);
@@ -1516,7 +1526,7 @@ static void add_sect_attrs(struct module *mod, const struct load_info *info)
sattr->mattr.show = module_sect_show;
sattr->mattr.store = NULL;
sattr->mattr.attr.name = sattr->name;
- sattr->mattr.attr.mode = S_IRUGO;
+ sattr->mattr.attr.mode = S_IRUSR;
*(gattr++) = &(sattr++)->mattr.attr;
}
*gattr = NULL;
@@ -4147,6 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
{
struct module *mod = list_entry(p, struct module, list);
char buf[MODULE_FLAGS_BUF_SIZE];
+ unsigned long value;
/* We always ignore unformed modules. */
if (mod->state == MODULE_STATE_UNFORMED)
@@ -4162,7 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading" :
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%pK", mod->core_layout.base);
+ value = m->private ? 0 : (unsigned long)mod->core_layout.base;
+ seq_printf(m, " 0x" KALLSYM_FMT, value);
/* Taints info */
if (mod->taints)
@@ -4184,9 +4196,23 @@ static const struct seq_operations modules_op = {
.show = m_show
};
+/*
+ * This also sets the "private" pointer to non-NULL if the
+ * kernel pointers should be hidden (so you can just test
+ * "m->private" to see if you should keep the values private).
+ *
+ * We use the same logic as for /proc/kallsyms.
+ */
static int modules_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &modules_op);
+ int err = seq_open(file, &modules_op);
+
+ if (!err) {
+ struct seq_file *m = file->private_data;
+ m->private = kallsyms_show_value() ? NULL : (void *)8ul;
+ }
+
+ return 0;
}
static const struct file_operations proc_modules_operations = {
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index e4b43fef89f5..59c471de342a 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -203,6 +203,21 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
extern int rcu_cpu_stall_suppress;
int rcu_jiffies_till_stall_check(void);
+#define rcu_ftrace_dump_stall_suppress() \
+do { \
+ if (!rcu_cpu_stall_suppress) \
+ rcu_cpu_stall_suppress = 3; \
+} while (0)
+
+#define rcu_ftrace_dump_stall_unsuppress() \
+do { \
+ if (rcu_cpu_stall_suppress == 3) \
+ rcu_cpu_stall_suppress = 0; \
+} while (0)
+
+#else /* #endif #ifdef CONFIG_RCU_STALL_COMMON */
+#define rcu_ftrace_dump_stall_suppress()
+#define rcu_ftrace_dump_stall_unsuppress()
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
/*
@@ -220,8 +235,12 @@ do { \
static atomic_t ___rfd_beenhere = ATOMIC_INIT(0); \
\
if (!atomic_read(&___rfd_beenhere) && \
- !atomic_xchg(&___rfd_beenhere, 1)) \
+ !atomic_xchg(&___rfd_beenhere, 1)) { \
+ tracing_off(); \
+ rcu_ftrace_dump_stall_suppress(); \
ftrace_dump(oops_dump_mode); \
+ rcu_ftrace_dump_stall_unsuppress(); \
+ } \
} while (0)
void rcu_early_boot_tests(void);
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 7649fcd2c4c7..88cba7c2956c 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -23,6 +23,7 @@
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
+#include <linux/rcupdate.h>
#include "rcu_segcblist.h"
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 45f2ffbc1e78..362eb2f78b3c 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
#include <asm/byteorder.h>
#include <linux/torture.h>
#include <linux/vmalloc.h>
+#include <linux/sched/debug.h>
#include "rcu.h"
@@ -89,6 +90,7 @@ torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
torture_param(int, stall_cpu_holdoff, 10,
"Time to wait before starting stall (s).");
+torture_param(int, stall_cpu_irqsoff, 0, "Disable interrupts while stalling.");
torture_param(int, stat_interval, 60,
"Number of seconds between stats printk()s");
torture_param(int, stutter, 5, "Number of seconds to run/halt test");
@@ -1239,6 +1241,7 @@ rcu_torture_stats_print(void)
long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
static unsigned long rtcv_snap = ULONG_MAX;
+ static bool splatted;
struct task_struct *wtp;
for_each_possible_cpu(cpu) {
@@ -1324,6 +1327,10 @@ rcu_torture_stats_print(void)
gpnum, completed, flags,
wtp == NULL ? ~0UL : wtp->state,
wtp == NULL ? -1 : (int)task_cpu(wtp));
+ if (!splatted && wtp) {
+ sched_show_task(wtp);
+ splatted = true;
+ }
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1357,7 +1364,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
"test_boost=%d/%d test_boost_interval=%d "
"test_boost_duration=%d shutdown_secs=%d "
- "stall_cpu=%d stall_cpu_holdoff=%d "
+ "stall_cpu=%d stall_cpu_holdoff=%d stall_cpu_irqsoff=%d "
"n_barrier_cbs=%d "
"onoff_interval=%d onoff_holdoff=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
@@ -1365,7 +1372,7 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
test_boost, cur_ops->can_boost,
test_boost_interval, test_boost_duration, shutdown_secs,
- stall_cpu, stall_cpu_holdoff,
+ stall_cpu, stall_cpu_holdoff, stall_cpu_irqsoff,
n_barrier_cbs,
onoff_interval, onoff_holdoff);
}
@@ -1430,12 +1437,19 @@ static int rcu_torture_stall(void *args)
if (!kthread_should_stop()) {
stop_at = get_seconds() + stall_cpu;
/* RCU CPU stall is expected behavior in following code. */
- pr_alert("rcu_torture_stall start.\n");
rcu_read_lock();
- preempt_disable();
+ if (stall_cpu_irqsoff)
+ local_irq_disable();
+ else
+ preempt_disable();
+ pr_alert("rcu_torture_stall start on CPU %d.\n",
+ smp_processor_id());
while (ULONG_CMP_LT(get_seconds(), stop_at))
continue; /* Induce RCU CPU stall warning. */
- preempt_enable();
+ if (stall_cpu_irqsoff)
+ local_irq_enable();
+ else
+ preempt_enable();
rcu_read_unlock();
pr_alert("rcu_torture_stall end.\n");
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3e3650e94ae6..f9c0ca2ccf0c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -534,8 +534,8 @@ module_param(rcu_kick_kthreads, bool, 0644);
* How long the grace period must be before we start recruiting
* quiescent-state help from rcu_note_context_switch().
*/
-static ulong jiffies_till_sched_qs = HZ / 20;
-module_param(jiffies_till_sched_qs, ulong, 0644);
+static ulong jiffies_till_sched_qs = HZ / 10;
+module_param(jiffies_till_sched_qs, ulong, 0444);
static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
struct rcu_data *rdp);
@@ -734,7 +734,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
int idx = (READ_ONCE(rnp->completed) + 1) & 0x1;
int *fp = &rnp->need_future_gp[idx];
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_future_needs_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
return READ_ONCE(*fp);
}
@@ -746,7 +746,7 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
static bool
cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "cpu_needs_another_gp() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_gp_in_progress(rsp))
return false; /* No, a grace period is already in progress. */
if (rcu_future_needs_gp(rsp))
@@ -773,7 +773,7 @@ static void rcu_eqs_enter_common(bool user)
struct rcu_data *rdp;
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_enter_common() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0);
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
!user && !is_idle_task(current)) {
@@ -837,10 +837,13 @@ static void rcu_eqs_enter(bool user)
* We crowbar the ->dynticks_nesting field to zero to allow for
* the possibility of usermode upcalls having messed up our count
* of interrupt nesting level during the prior busy period.
+ *
+ * If you add or remove a call to rcu_idle_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_enter(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rcu_eqs_enter(false);
}
@@ -852,10 +855,13 @@ void rcu_idle_enter(void)
* is permitted between this call and rcu_user_exit(). This way the
* CPU doesn't need to maintain the tick for RCU maintenance purposes
* when the CPU runs in userspace.
+ *
+ * If you add or remove a call to rcu_user_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_enter(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -875,12 +881,15 @@ void rcu_user_enter(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit(void)
{
struct rcu_dynticks *rdtp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
/* Page faults can happen in NMI handlers, so check... */
@@ -899,6 +908,9 @@ void rcu_irq_exit(void)
/*
* Wrapper for rcu_irq_exit() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_exit_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_exit_irqson(void)
{
@@ -947,7 +959,7 @@ static void rcu_eqs_exit(bool user)
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_eqs_exit() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
oldval = rdtp->dynticks_nesting;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0);
@@ -971,6 +983,9 @@ static void rcu_eqs_exit(bool user)
* allow for the possibility of usermode upcalls messing up our count
* of interrupt nesting level during the busy period that is just
* now starting.
+ *
+ * If you add or remove a call to rcu_idle_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_idle_exit(void)
{
@@ -987,6 +1002,9 @@ void rcu_idle_exit(void)
*
* Exit RCU idle mode while entering the kernel because it can
* run a RCU read side critical section anytime.
+ *
+ * If you add or remove a call to rcu_user_exit(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_user_exit(void)
{
@@ -1012,13 +1030,16 @@ void rcu_user_exit(void)
* Use things like work queues to work around this limitation.
*
* You have been warned.
+ *
+ * If you add or remove a call to rcu_irq_enter(), be sure to test with
+ * CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter(void)
{
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
rdtp = this_cpu_ptr(&rcu_dynticks);
/* Page faults can happen in NMI handlers, so check... */
@@ -1037,6 +1058,9 @@ void rcu_irq_enter(void)
/*
* Wrapper for rcu_irq_enter() where interrupts are enabled.
+ *
+ * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_irq_enter_irqson(void)
{
@@ -1055,6 +1079,9 @@ void rcu_irq_enter_irqson(void)
* that the CPU is active. This implementation permits nested NMIs, as
* long as the nesting level does not overflow an int. (You will probably
* run out of stack space first.)
+ *
+ * If you add or remove a call to rcu_nmi_enter(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_enter(void)
{
@@ -1087,6 +1114,9 @@ void rcu_nmi_enter(void)
* RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting
* to let the RCU grace-period handling know that the CPU is back to
* being RCU-idle.
+ *
+ * If you add or remove a call to rcu_nmi_exit(), be sure to test
+ * with CONFIG_RCU_EQS_DEBUG=y.
*/
void rcu_nmi_exit(void)
{
@@ -1207,6 +1237,22 @@ static int rcu_is_cpu_rrupt_from_idle(void)
}
/*
+ * We are reporting a quiescent state on behalf of some other CPU, so
+ * it is our responsibility to check for and handle potential overflow
+ * of the rcu_node ->gpnum counter with respect to the rcu_data counters.
+ * After all, the CPU might be in deep idle state, and thus executing no
+ * code whatsoever.
+ */
+static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
+{
+ lockdep_assert_held(&rnp->lock);
+ if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rnp->gpnum))
+ WRITE_ONCE(rdp->gpwrap, true);
+ if (ULONG_CMP_LT(rdp->rcu_iw_gpnum + ULONG_MAX / 4, rnp->gpnum))
+ rdp->rcu_iw_gpnum = rnp->gpnum + ULONG_MAX / 4;
+}
+
+/*
* Snapshot the specified CPU's dynticks counter so that we can later
* credit them with an implicit quiescent state. Return 1 if this CPU
* is in dynticks idle mode, which is an extended quiescent state.
@@ -1216,15 +1262,34 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
- if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
- rdp->mynode->gpnum))
- WRITE_ONCE(rdp->gpwrap, true);
+ rcu_gpnum_ovf(rdp->mynode, rdp);
return 1;
}
return 0;
}
/*
+ * Handler for the irq_work request posted when a grace period has
+ * gone on for too long, but not yet long enough for an RCU CPU
+ * stall warning. Set state appropriately, but just complain if
+ * there is unexpected state on entry.
+ */
+static void rcu_iw_handler(struct irq_work *iwp)
+{
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+
+ rdp = container_of(iwp, struct rcu_data, rcu_iw);
+ rnp = rdp->mynode;
+ raw_spin_lock_rcu_node(rnp);
+ if (!WARN_ON_ONCE(!rdp->rcu_iw_pending)) {
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ rdp->rcu_iw_pending = false;
+ }
+ raw_spin_unlock_rcu_node(rnp);
+}
+
+/*
* Return true if the specified CPU has passed through a quiescent
* state by virtue of being in or having passed through an dynticks
* idle state since the last call to dyntick_save_progress_counter()
@@ -1235,8 +1300,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
unsigned long jtsq;
bool *rnhqp;
bool *ruqp;
- unsigned long rjtsc;
- struct rcu_node *rnp;
+ struct rcu_node *rnp = rdp->mynode;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -1249,34 +1313,25 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
rdp->dynticks_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
- /* Compute and saturate jiffies_till_sched_qs. */
- jtsq = jiffies_till_sched_qs;
- rjtsc = rcu_jiffies_till_stall_check();
- if (jtsq > rjtsc / 2) {
- WRITE_ONCE(jiffies_till_sched_qs, rjtsc);
- jtsq = rjtsc / 2;
- } else if (jtsq < 1) {
- WRITE_ONCE(jiffies_till_sched_qs, 1);
- jtsq = 1;
- }
-
/*
* Has this CPU encountered a cond_resched_rcu_qs() since the
* beginning of the grace period? For this to be the case,
* the CPU has to have noticed the current grace period. This
* might not be the case for nohz_full CPUs looping in the kernel.
*/
- rnp = rdp->mynode;
+ jtsq = jiffies_till_sched_qs;
ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu);
if (time_after(jiffies, rdp->rsp->gp_start + jtsq) &&
READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) &&
READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc"));
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
- } else {
+ } else if (time_after(jiffies, rdp->rsp->gp_start + jtsq)) {
/* Load rcu_qs_ctr before store to rcu_urgent_qs. */
smp_store_release(ruqp, true);
}
@@ -1285,6 +1340,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl"));
rdp->offline_fqs++;
+ rcu_gpnum_ovf(rnp, rdp);
return 1;
}
@@ -1304,10 +1360,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* updates are only once every few jiffies, the probability of
* lossage (and thus of slight grace-period extension) is
* quite low.
- *
- * Note that if the jiffies_till_sched_qs boot/sysfs parameter
- * is set too high, we override with half of the RCU CPU stall
- * warning delay.
*/
rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
@@ -1316,15 +1368,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
WRITE_ONCE(*rnhqp, true);
/* Store rcu_need_heavy_qs before rcu_urgent_qs. */
smp_store_release(ruqp, true);
- rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
+ rdp->rsp->jiffies_resched += jtsq; /* Re-enable beating. */
}
/*
- * If more than halfway to RCU CPU stall-warning time, do
- * a resched_cpu() to try to loosen things up a bit.
+ * If more than halfway to RCU CPU stall-warning time, do a
+ * resched_cpu() to try to loosen things up a bit. Also check to
+ * see if the CPU is getting hammered with interrupts, but only
+ * once per grace period, just to keep the IPIs down to a dull roar.
*/
- if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2)
+ if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) {
resched_cpu(rdp->cpu);
+ if (IS_ENABLED(CONFIG_IRQ_WORK) &&
+ !rdp->rcu_iw_pending && rdp->rcu_iw_gpnum != rnp->gpnum &&
+ (rnp->ffmask & rdp->grpmask)) {
+ init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
+ rdp->rcu_iw_pending = true;
+ rdp->rcu_iw_gpnum = rnp->gpnum;
+ irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
+ }
+ }
return 0;
}
@@ -1513,6 +1576,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
{
int cpu;
unsigned long flags;
+ struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
struct rcu_node *rnp = rcu_get_root(rsp);
long totqlen = 0;
@@ -1528,7 +1592,9 @@ static void print_cpu_stall(struct rcu_state *rsp)
*/
pr_err("INFO: %s self-detected stall on CPU", rsp->name);
print_cpu_stall_info_begin();
+ raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
print_cpu_stall_info(rsp, smp_processor_id());
+ raw_spin_unlock_irqrestore_rcu_node(rdp->mynode, flags);
print_cpu_stall_info_end();
for_each_possible_cpu(cpu)
totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda,
@@ -1922,6 +1988,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp,
rdp->core_needs_qs = need_gp;
zero_cpu_stall_ticks(rdp);
WRITE_ONCE(rdp->gpwrap, false);
+ rcu_gpnum_ovf(rnp, rdp);
}
return ret;
}
@@ -3702,6 +3769,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->cpu_no_qs.b.norm = true;
rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
+ rdp->rcu_iw_pending = false;
+ rdp->rcu_iw_gpnum = rnp->gpnum - 1;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
@@ -3739,10 +3808,24 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing)
*/
int rcutree_online_cpu(unsigned int cpu)
{
- sync_sched_exp_online_cleanup(cpu);
- rcutree_affinity_setting(cpu, -1);
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask |= rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_online_cpu(cpu);
+ if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+ return 0; /* Too early in boot for scheduler work. */
+ sync_sched_exp_online_cleanup(cpu);
+ rcutree_affinity_setting(cpu, -1);
return 0;
}
@@ -3752,6 +3835,19 @@ int rcutree_online_cpu(unsigned int cpu)
*/
int rcutree_offline_cpu(unsigned int cpu)
{
+ unsigned long flags;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp) {
+ rdp = per_cpu_ptr(rsp->rda, cpu);
+ rnp = rdp->mynode;
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rnp->ffmask &= ~rdp->grpmask;
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ }
+
rcutree_affinity_setting(cpu, cpu);
if (IS_ENABLED(CONFIG_TREE_SRCU))
srcu_offline_cpu(cpu);
@@ -4200,8 +4296,7 @@ void __init rcu_init(void)
for_each_online_cpu(cpu) {
rcutree_prepare_cpu(cpu);
rcu_cpu_starting(cpu);
- if (IS_ENABLED(CONFIG_TREE_SRCU))
- srcu_online_cpu(cpu);
+ rcutree_online_cpu(cpu);
}
}
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 8e1f285f0a70..46a5d1991450 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -103,6 +103,7 @@ struct rcu_node {
/* Online CPUs for next expedited GP. */
/* Any CPU that has ever been online will */
/* have its bit set. */
+ unsigned long ffmask; /* Fully functional CPUs. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
/* Only one bit will be set in this mask. */
int grplo; /* lowest-numbered CPU or group here. */
@@ -285,6 +286,10 @@ struct rcu_data {
/* 8) RCU CPU stall data. */
unsigned int softirq_snap; /* Snapshot of softirq activity. */
+ /* ->rcu_iw* fields protected by leaf rcu_node ->lock. */
+ struct irq_work rcu_iw; /* Check for non-irq activity. */
+ bool rcu_iw_pending; /* Is ->rcu_iw pending? */
+ unsigned long rcu_iw_gpnum; /* ->gpnum associated with ->rcu_iw. */
int cpu;
struct rcu_state *rsp;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 34125d23e58a..910405dc6e5c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -55,6 +55,7 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work);
* This probably needs to be excluded from -rt builds.
*/
#define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; })
+#define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1)
#endif /* #else #ifdef CONFIG_RCU_BOOST */
@@ -326,7 +327,7 @@ static void rcu_preempt_note_context_switch(bool preempt)
struct rcu_data *rdp;
struct rcu_node *rnp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_preempt_note_context_switch() invoked with interrupts enabled!!!\n");
+ lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
if (t->rcu_read_lock_nesting > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -531,7 +532,7 @@ void rcu_read_unlock_special(struct task_struct *t)
/* Unboost if we were boosted. */
if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_unlock(&rnp->boost_mtx);
+ rt_mutex_futex_unlock(&rnp->boost_mtx);
/*
* If this was the last task on the expedited lists,
@@ -912,8 +913,6 @@ void exit_rcu(void)
#ifdef CONFIG_RCU_BOOST
-#include "../locking/rtmutex_common.h"
-
static void rcu_wake_cond(struct task_struct *t, int status)
{
/*
@@ -1422,7 +1421,7 @@ int rcu_needs_cpu(u64 basemono, u64 *nextevt)
struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
unsigned long dj;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_needs_cpu() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
@@ -1471,7 +1470,7 @@ static void rcu_prepare_for_idle(void)
struct rcu_state *rsp;
int tne;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_prepare_for_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
@@ -1508,7 +1507,7 @@ static void rcu_prepare_for_idle(void)
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
rdp = this_cpu_ptr(rsp->rda);
- if (rcu_segcblist_pend_cbs(&rdp->cblist))
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist))
continue;
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
@@ -1526,7 +1525,7 @@ static void rcu_prepare_for_idle(void)
*/
static void rcu_cleanup_after_idle(void)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_cleanup_after_idle() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
@@ -1672,6 +1671,7 @@ static void print_cpu_stall_info_begin(void)
*/
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
{
+ unsigned long delta;
char fast_no_hz[72];
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_dynticks *rdtp = rdp->dynticks;
@@ -1686,11 +1686,15 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
ticks_value = rsp->gpnum - rdp->gpnum;
}
print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
- pr_err("\t%d-%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
+ delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum;
+ pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
cpu,
"O."[!!cpu_online(cpu)],
"o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)],
"N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)],
+ !IS_ENABLED(CONFIG_IRQ_WORK) ? '?' :
+ rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
+ "!."[!delta],
ticks_value, ticks_title,
rcu_dynticks_snap(rdtp) & 0xfff,
rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
@@ -2013,7 +2017,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
+ lockdep_assert_irqs_disabled();
if (!rcu_is_nocb_cpu(smp_processor_id()))
return false; /* Not NOCBs CPU, caller must migrate CBs. */
__call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e3e60efaafee..fbd56d6e575b 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -495,6 +495,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#endif
int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
+EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress);
static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
module_param(rcu_cpu_stall_suppress, int, 0644);
@@ -576,7 +577,6 @@ DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT;
module_param(rcu_task_stall_timeout, int, 0644);
-static void rcu_spawn_tasks_kthread(void);
static struct task_struct *rcu_tasks_kthread_ptr;
/**
@@ -601,7 +601,6 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
{
unsigned long flags;
bool needwake;
- bool havetask = READ_ONCE(rcu_tasks_kthread_ptr);
rhp->next = NULL;
rhp->func = func;
@@ -611,11 +610,8 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func)
rcu_tasks_cbs_tail = &rhp->next;
raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
/* We can't create the thread unless interrupts are enabled. */
- if ((needwake && havetask) ||
- (!havetask && !irqs_disabled_flags(flags))) {
- rcu_spawn_tasks_kthread();
+ if (needwake && READ_ONCE(rcu_tasks_kthread_ptr))
wake_up(&rcu_tasks_cbs_wq);
- }
}
EXPORT_SYMBOL_GPL(call_rcu_tasks);
@@ -854,27 +850,18 @@ static int __noreturn rcu_tasks_kthread(void *arg)
}
}
-/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
-static void rcu_spawn_tasks_kthread(void)
+/* Spawn rcu_tasks_kthread() at core_initcall() time. */
+static int __init rcu_spawn_tasks_kthread(void)
{
- static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
struct task_struct *t;
- if (READ_ONCE(rcu_tasks_kthread_ptr)) {
- smp_mb(); /* Ensure caller sees full kthread. */
- return;
- }
- mutex_lock(&rcu_tasks_kthread_mutex);
- if (rcu_tasks_kthread_ptr) {
- mutex_unlock(&rcu_tasks_kthread_mutex);
- return;
- }
t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
BUG_ON(IS_ERR(t));
smp_mb(); /* Ensure others see full kthread. */
WRITE_ONCE(rcu_tasks_kthread_ptr, t);
- mutex_unlock(&rcu_tasks_kthread_mutex);
+ return 0;
}
+core_initcall(rcu_spawn_tasks_kthread);
/* Do the srcu_read_lock() for the above synchronize_srcu(). */
void exit_tasks_rcu_start(void)
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..e086babe6c61 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -388,7 +388,7 @@ void sched_clock_tick(void)
if (unlikely(!sched_clock_running))
return;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
scd = this_scd();
__scd_stamp(scd);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a39a08113003..5b82a0073532 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -506,8 +506,7 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ raw_spin_lock_irqsave(&rq->lock, flags);
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -4851,6 +4850,7 @@ int __sched _cond_resched(void)
preempt_schedule_common();
return 1;
}
+ rcu_all_qs();
return 0;
}
EXPORT_SYMBOL(_cond_resched);
@@ -5174,6 +5174,7 @@ void sched_show_task(struct task_struct *p)
show_stack(p, NULL);
put_task_stack(p);
}
+EXPORT_SYMBOL_GPL(sched_show_task);
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 9209d83ecdcf..ba0da243fdd8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -649,6 +649,7 @@ static int sugov_start(struct cpufreq_policy *policy)
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
@@ -714,11 +715,6 @@ struct cpufreq_governor *cpufreq_default_governor(void)
static int __init sugov_register(void)
{
- int cpu;
-
- for_each_possible_cpu(cpu)
- per_cpu(sugov_cpu, cpu).cpu = cpu;
-
return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf97c53..9be8b68a66da 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max)
{
u64 accounted;
- /* Shall be converted to a lockdep-enabled lightweight check */
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
accounted = steal_account_process_time(max);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 418a1c045933..5f0dfb2abb8d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -190,7 +190,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
u32 ret = SECCOMP_RET_ALLOW;
/* Make sure cross-thread synced filter points somewhere sane. */
struct seccomp_filter *f =
- lockless_dereference(current->seccomp.filter);
+ READ_ONCE(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
diff --git a/kernel/smp.c b/kernel/smp.c
index c94dd85c8d41..084c8b3a2681 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -213,7 +213,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
call_single_data_t *csd, *csd_next;
static bool warned;
- WARN_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
head = this_cpu_ptr(&call_single_queue);
entry = llist_del_all(head);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 4e09821f9d9e..662f7b1b7a78 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -137,7 +137,7 @@ EXPORT_SYMBOL(__local_bh_disable_ip);
static void __local_bh_enable(unsigned int cnt)
{
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (softirq_count() == (cnt & SOFTIRQ_MASK))
trace_softirqs_on(_RET_IP_);
@@ -158,7 +158,8 @@ EXPORT_SYMBOL(_local_bh_enable);
void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
{
- WARN_ON_ONCE(in_irq() || irqs_disabled());
+ WARN_ON_ONCE(in_irq());
+ lockdep_assert_irqs_enabled();
#ifdef CONFIG_TRACE_IRQFLAGS
local_irq_disable();
#endif
@@ -396,9 +397,8 @@ void irq_exit(void)
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
#else
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
#endif
-
account_irq_exit_time(current);
preempt_count_sub(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
@@ -488,7 +488,7 @@ EXPORT_SYMBOL(__tasklet_hi_schedule);
void __tasklet_hi_schedule_first(struct tasklet_struct *t)
{
- BUG_ON(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
t->next = __this_cpu_read(tasklet_hi_vec.head);
__this_cpu_write(tasklet_hi_vec.head, t);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 5718b3ea202a..0fef395662a6 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -68,7 +68,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func)
* we raced with task_work_run(), *pprev == NULL/exited.
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
- while ((work = lockless_dereference(*pprev))) {
+ while ((work = READ_ONCE(*pprev))) {
if (work->func != func)
pprev = &work->next;
else if (cmpxchg(pprev, work, work->next) == work)
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 0dbab6d1acb4..dd53e354f630 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -22,7 +22,7 @@
#define div_factor 3
-static u32 rand1, preh_val, posth_val, jph_val;
+static u32 rand1, preh_val, posth_val;
static int errors, handler_errors, num_tests;
static u32 (*target)(u32 value);
static u32 (*target2)(u32 value);
@@ -34,6 +34,10 @@ static noinline u32 kprobe_target(u32 value)
static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("pre-handler is preemptible\n");
+ }
preh_val = (rand1 / div_factor);
return 0;
}
@@ -41,6 +45,10 @@ static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
unsigned long flags)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("post-handler is preemptible\n");
+ }
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
pr_err("incorrect value in post_handler\n");
@@ -154,8 +162,15 @@ static int test_kprobes(void)
}
+#if 0
+static u32 jph_val;
+
static u32 j_kprobe_target(u32 value)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("jprobe-handler is preemptible\n");
+ }
if (value != rand1) {
handler_errors++;
pr_err("incorrect value in jprobe handler\n");
@@ -227,11 +242,19 @@ static int test_jprobes(void)
return 0;
}
+#else
+#define test_jprobe() (0)
+#define test_jprobes() (0)
+#endif
#ifdef CONFIG_KRETPROBES
static u32 krph_val;
static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("kretprobe entry handler is preemptible\n");
+ }
krph_val = (rand1 / div_factor);
return 0;
}
@@ -240,6 +263,10 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
{
unsigned long ret = regs_return_value(regs);
+ if (preemptible()) {
+ handler_errors++;
+ pr_err("kretprobe return handler is preemptible\n");
+ }
if (ret != (rand1 / div_factor)) {
handler_errors++;
pr_err("incorrect value in kretprobe handler\n");
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88f75f92ef36..d32520840fde 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,9 +758,7 @@ void clock_was_set(void)
*/
void hrtimers_resume(void)
{
- WARN_ONCE(!irqs_disabled(),
- KERN_INFO "hrtimers_resume() called with IRQs enabled!");
-
+ lockdep_assert_irqs_disabled();
/* Retrigger on the local CPU */
retrigger_next_event(NULL);
/* And schedule a retrigger for all others */
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 5b117110b55b..1f27887aa194 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -603,7 +603,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
/*
* Disarm any old timer after extracting its expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
ret = 0;
old_incr = timer->it.cpu.incr;
@@ -1034,7 +1034,7 @@ static void posix_cpu_timer_rearm(struct k_itimer *timer)
/*
* Now re-arm for the new expiry time.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
arm_timer(timer);
unlock:
unlock_task_sighand(p, &flags);
@@ -1125,7 +1125,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
struct k_itimer *timer, *next;
unsigned long flags;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
/*
* The fast path checks that there are no expired thread or thread
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 69f3dbe38984..99578f06c8d4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -198,7 +198,7 @@ static bool check_tick_dependency(atomic_t *dep)
static bool can_stop_full_tick(int cpu, struct tick_sched *ts)
{
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
if (unlikely(!cpu_online(cpu)))
return false;
@@ -937,8 +937,7 @@ void tick_nohz_idle_enter(void)
{
struct tick_sched *ts;
- WARN_ON_ONCE(irqs_disabled());
-
+ lockdep_assert_irqs_enabled();
/*
* Update the idle state in the scheduler domain hierarchy
* when tick_nohz_stop_sched_tick() is called from the idle loop.
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index dc498b605d5d..95888ae6c263 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -275,7 +275,7 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- err = perf_event_read_local(ee->event, &value);
+ err = perf_event_read_local(ee->event, &value, NULL, NULL);
/*
* this api is ugly since we miss [-22..-2] range of valid
* counter values, but that's uapi
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 81279c6602ff..845f3805c73d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2724,7 +2724,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
* if it happened, we have to fail the write.
*/
barrier();
- if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ if (unlikely(READ_ONCE(cpu_buffer->buffer) != buffer)) {
local_dec(&cpu_buffer->committing);
local_dec(&cpu_buffer->commits);
return NULL;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 401b0639116f..6b0b343a36a2 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1460,7 +1460,7 @@ extern struct trace_event_file *find_event_file(struct trace_array *tr,
static inline void *event_file_data(struct file *filp)
{
- return ACCESS_ONCE(file_inode(filp)->i_private);
+ return READ_ONCE(file_inode(filp)->i_private);
}
extern struct mutex event_mutex;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 719a52a4064a..734accc02418 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -78,7 +78,7 @@ check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
- int frame_size = ACCESS_ONCE(tracer_frame);
+ int frame_size = READ_ONCE(tracer_frame);
int i, x;
this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b..d32b45662fb6 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -894,7 +894,7 @@ static bool new_idmap_permitted(const struct file *file,
int proc_setgroups_show(struct seq_file *seq, void *v)
{
struct user_namespace *ns = seq->private;
- unsigned long userns_flags = ACCESS_ONCE(ns->flags);
+ unsigned long userns_flags = READ_ONCE(ns->flags);
seq_printf(seq, "%s\n",
(userns_flags & USERNS_SETGROUPS_ALLOWED) ?
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a2dccfe1acec..13f67b5a0a0c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1376,7 +1376,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
* queued or lose PENDING. Grabbing PENDING and queueing should
* happen with IRQ disabled.
*/
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
debug_work_activate(work);
@@ -2491,15 +2491,8 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
- /*
- * Explicitly init the crosslock for wq_barrier::done, make its lock
- * key a subkey of the corresponding work. As a result we won't
- * build a dependency between wq_barrier::done and unrelated work.
- */
- lockdep_init_map_crosslock((struct lockdep_map *)&barr->done.map,
- "(complete)wq_barr::done",
- target->lockdep_map.key, 1);
- __init_completion(&barr->done);
+ init_completion_map(&barr->done, &target->lockdep_map);
+
barr->task = current;
/*
@@ -2605,16 +2598,13 @@ void flush_workqueue(struct workqueue_struct *wq)
struct wq_flusher this_flusher = {
.list = LIST_HEAD_INIT(this_flusher.list),
.flush_color = -1,
- .done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+ .done = COMPLETION_INITIALIZER_ONSTACK_MAP(this_flusher.done, wq->lockdep_map),
};
int next_color;
if (WARN_ON(!wq_online))
return;
- lock_map_acquire(&wq->lockdep_map);
- lock_map_release(&wq->lockdep_map);
-
mutex_lock(&wq->mutex);
/*
@@ -2877,9 +2867,6 @@ bool flush_work(struct work_struct *work)
if (WARN_ON(!wq_online))
return false;
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
-
if (start_flush_work(work, &barr)) {
wait_for_completion(&barr.done);
destroy_work_on_stack(&barr.work);
@@ -4640,7 +4627,7 @@ static void rebind_workers(struct worker_pool *pool)
* concurrency management. Note that when or whether
* @worker clears REBOUND doesn't affect correctness.
*
- * ACCESS_ONCE() is necessary because @worker->flags may be
+ * WRITE_ONCE() is necessary because @worker->flags may be
* tested without holding any lock in
* wq_worker_waking_up(). Without it, NOT_RUNNING test may
* fail incorrectly leading to premature concurrency
@@ -4649,7 +4636,7 @@ static void rebind_workers(struct worker_pool *pool)
WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
worker_flags |= WORKER_REBOUND;
worker_flags &= ~WORKER_UNBOUND;
- ACCESS_ONCE(worker->flags) = worker_flags;
+ WRITE_ONCE(worker->flags, worker_flags);
}
spin_unlock_irq(&pool->lock);