summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt42
-rw-r--r--kernel/audit_fsnotify.c3
-rw-r--r--kernel/audit_watch.c3
-rw-r--r--kernel/bpf/cgroup.c54
-rw-r--r--kernel/bpf/core.c7
-rw-r--r--kernel/bpf/verifier.c55
-rw-r--r--kernel/cgroup/cgroup-v1.c17
-rw-r--r--kernel/cgroup/cgroup.c120
-rw-r--r--kernel/cgroup/cpuset.c23
-rw-r--r--kernel/cgroup/misc.c31
-rw-r--r--kernel/cgroup/rstat.c2
-rw-r--r--kernel/cred.c5
-rw-r--r--kernel/debug/kdb/kdb_bt.c16
-rw-r--r--kernel/debug/kdb/kdb_main.c37
-rw-r--r--kernel/debug/kdb/kdb_private.h4
-rw-r--r--kernel/debug/kdb/kdb_support.c118
-rw-r--r--kernel/dma/coherent.c5
-rw-r--r--kernel/dma/swiotlb.c15
-rw-r--r--kernel/entry/syscall_user_dispatch.c12
-rw-r--r--kernel/events/core.c10
-rw-r--r--kernel/exit.c76
-rw-r--r--kernel/extable.c35
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/irq/irqdomain.c7
-rw-r--r--kernel/irq/msi.c4
-rw-r--r--kernel/kcov.c36
-rw-r--r--kernel/kcsan/core.c75
-rw-r--r--kernel/kcsan/kcsan.h8
-rw-r--r--kernel/kcsan/kcsan_test.c62
-rw-r--r--kernel/kcsan/report.c77
-rw-r--r--kernel/kcsan/selftest.c72
-rw-r--r--kernel/kexec_file.c5
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/locking/lockdep.c18
-rw-r--r--kernel/module.c79
-rw-r--r--kernel/pid.c36
-rw-r--r--kernel/power/energy_model.c86
-rw-r--r--kernel/power/hibernate.c12
-rw-r--r--kernel/power/power.h14
-rw-r--r--kernel/power/process.c2
-rw-r--r--kernel/power/suspend.c18
-rw-r--r--kernel/power/swap.c21
-rw-r--r--kernel/printk/printk.c4
-rw-r--r--kernel/reboot.c2
-rw-r--r--kernel/resource.c54
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/core.c53
-rw-r--r--kernel/sched/core_sched.c4
-rw-r--r--kernel/sched/fair.c4
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h3
-rw-r--r--kernel/sched/topology.c1
-rw-r--r--kernel/signal.c83
-rw-r--r--kernel/stacktrace.c30
-rw-r--r--kernel/time/posix-cpu-timers.c19
-rw-r--r--kernel/trace/ftrace.c5
-rw-r--r--kernel/trace/ring_buffer.c5
-rw-r--r--kernel/trace/trace.c11
-rw-r--r--kernel/trace/trace_events_hist.c122
-rw-r--r--kernel/trace/trace_osnoise.c616
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/ucount.c20
-rw-r--r--kernel/workqueue.c189
63 files changed, 1633 insertions, 946 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 60f1bfc3c7b2..ce77f0265660 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,12 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
+config PREEMPT_NONE_BUILD
+ bool
+
+config PREEMPT_VOLUNTARY_BUILD
+ bool
+
+config PREEMPT_BUILD
+ bool
+ select PREEMPTION
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
choice
prompt "Preemption Model"
- default PREEMPT_NONE_BEHAVIOUR
+ default PREEMPT_NONE
-config PREEMPT_NONE_BEHAVIOUR
+config PREEMPT_NONE
bool "No Forced Preemption (Server)"
- select PREEMPT_NONE if !PREEMPT_DYNAMIC
+ select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -18,10 +29,10 @@ config PREEMPT_NONE_BEHAVIOUR
raw processing power of the kernel, irrespective of scheduling
latencies.
-config PREEMPT_VOLUNTARY_BEHAVIOUR
+config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC
+ select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -37,10 +48,10 @@ config PREEMPT_VOLUNTARY_BEHAVIOUR
Select this if you are building a kernel for a desktop system.
-config PREEMPT_BEHAVIOUR
+config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT
+ select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -58,7 +69,7 @@ config PREEMPT_BEHAVIOUR
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
- depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC
+ depends on EXPERT && ARCH_SUPPORTS_RT
select PREEMPTION
help
This option turns the kernel into a real-time kernel by replacing
@@ -75,17 +86,6 @@ config PREEMPT_RT
endchoice
-config PREEMPT_NONE
- bool
-
-config PREEMPT_VOLUNTARY
- bool
-
-config PREEMPT
- bool
- select PREEMPTION
- select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
-
config PREEMPT_COUNT
bool
@@ -95,8 +95,8 @@ config PREEMPTION
config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
- depends on HAVE_PREEMPT_DYNAMIC
- select PREEMPT
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select PREEMPT_BUILD
default y
help
This option allows to define the preemption model on the kernel
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index 60739d5e3373..02348b48447c 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
@@ -160,8 +160,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
- if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
- WARN_ON_ONCE(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 698b62b4a2ec..713b256be944 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -473,8 +473,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
parent = container_of(inode_mark, struct audit_parent, mark);
- if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
- WARN_ON_ONCE(!inode))
+ if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 03145d45e3d5..2ca643af9a54 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -430,10 +430,10 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type, u32 flags)
+static int __cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type, u32 flags)
{
u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
struct bpf_prog *old_prog = NULL;
@@ -523,6 +523,20 @@ cleanup:
return err;
}
+static int cgroup_bpf_attach(struct cgroup *cgrp,
+ struct bpf_prog *prog, struct bpf_prog *replace_prog,
+ struct bpf_cgroup_link *link,
+ enum bpf_attach_type type,
+ u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
/* Swap updated BPF program for given link in effective program arrays across
* all descendant cgroups. This function is guaranteed to succeed.
*/
@@ -672,14 +686,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to detach or NULL
- * @prog: A link to detach or NULL
+ * @link: A link to detach or NULL
* @type: Type of detach operation
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_cgroup_link *link, enum bpf_attach_type type)
+static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
@@ -730,9 +744,20 @@ cleanup:
return err;
}
+static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
/* Must be called with cgroup_mutex held to avoid races. */
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
+static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
@@ -789,6 +814,17 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}
+static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 327e3996eadb..2405e39d800f 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -390,6 +390,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
i = end_new;
insn = prog->insnsi + end_old;
}
+ if (bpf_pseudo_func(insn)) {
+ ret = bpf_adj_delta_to_imm(insn, pos, end_old,
+ end_new, i, probe_pass);
+ if (ret)
+ return ret;
+ continue;
+ }
code = insn->code;
if ((BPF_CLASS(code) != BPF_JMP &&
BPF_CLASS(code) != BPF_JMP32) ||
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f0dca726ebfd..890b3ec375a3 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -240,12 +240,6 @@ static bool bpf_pseudo_kfunc_call(const struct bpf_insn *insn)
insn->src_reg == BPF_PSEUDO_KFUNC_CALL;
}
-static bool bpf_pseudo_func(const struct bpf_insn *insn)
-{
- return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
- insn->src_reg == BPF_PSEUDO_FUNC;
-}
-
struct bpf_call_arg_meta {
struct bpf_map *map_ptr;
bool raw_mode;
@@ -1960,16 +1954,10 @@ static int add_subprog_and_kfunc(struct bpf_verifier_env *env)
return -EPERM;
}
- if (bpf_pseudo_func(insn)) {
- ret = add_subprog(env, i + insn->imm + 1);
- if (ret >= 0)
- /* remember subprog */
- insn[1].imm = ret;
- } else if (bpf_pseudo_call(insn)) {
+ if (bpf_pseudo_func(insn) || bpf_pseudo_call(insn))
ret = add_subprog(env, i + insn->imm + 1);
- } else {
+ else
ret = add_kfunc_call(env, insn->imm, insn->off);
- }
if (ret < 0)
return ret;
@@ -3088,9 +3076,12 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
reg = &reg_state->stack[spi].spilled_ptr;
if (is_spilled_reg(&reg_state->stack[spi])) {
- if (size != BPF_REG_SIZE) {
- u8 scalar_size = 0;
+ u8 spill_size = 1;
+
+ for (i = BPF_REG_SIZE - 1; i > 0 && stype[i - 1] == STACK_SPILL; i--)
+ spill_size++;
+ if (size != BPF_REG_SIZE || spill_size != BPF_REG_SIZE) {
if (reg->type != SCALAR_VALUE) {
verbose_linfo(env, env->insn_idx, "; ");
verbose(env, "invalid size of register fill\n");
@@ -3101,10 +3092,7 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
if (dst_regno < 0)
return 0;
- for (i = BPF_REG_SIZE; i > 0 && stype[i - 1] == STACK_SPILL; i--)
- scalar_size++;
-
- if (!(off % BPF_REG_SIZE) && size == scalar_size) {
+ if (!(off % BPF_REG_SIZE) && size == spill_size) {
/* The earlier check_reg_arg() has decided the
* subreg_def for this insn. Save it first.
*/
@@ -3128,12 +3116,6 @@ static int check_stack_read_fixed_off(struct bpf_verifier_env *env,
state->regs[dst_regno].live |= REG_LIVE_WRITTEN;
return 0;
}
- for (i = 1; i < BPF_REG_SIZE; i++) {
- if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
- verbose(env, "corrupted spill memory\n");
- return -EACCES;
- }
- }
if (dst_regno >= 0) {
/* restore register state from stack */
@@ -9393,7 +9375,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (insn->src_reg == BPF_PSEUDO_FUNC) {
struct bpf_prog_aux *aux = env->prog->aux;
- u32 subprogno = insn[1].imm;
+ u32 subprogno = find_subprog(env,
+ env->insn_idx + insn->imm + 1);
if (!aux->func_info) {
verbose(env, "missing btf func_info\n");
@@ -12563,14 +12546,9 @@ static int jit_subprogs(struct bpf_verifier_env *env)
return 0;
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
- if (bpf_pseudo_func(insn)) {
- env->insn_aux_data[i].call_imm = insn->imm;
- /* subprog is encoded in insn[1].imm */
+ if (!bpf_pseudo_func(insn) && !bpf_pseudo_call(insn))
continue;
- }
- if (!bpf_pseudo_call(insn))
- continue;
/* Upon error here we cannot fall back to interpreter but
* need a hard reject of the program. Thus -EFAULT is
* propagated in any case.
@@ -12591,6 +12569,12 @@ static int jit_subprogs(struct bpf_verifier_env *env)
env->insn_aux_data[i].call_imm = insn->imm;
/* point imm to __bpf_call_base+1 from JITs point of view */
insn->imm = 1;
+ if (bpf_pseudo_func(insn))
+ /* jit (e.g. x86_64) may emit fewer instructions
+ * if it learns a u32 imm is the same as a u64 imm.
+ * Force a non zero here.
+ */
+ insn[1].imm = 1;
}
err = bpf_prog_alloc_jited_linfo(prog);
@@ -12675,7 +12659,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
insn = func[i]->insnsi;
for (j = 0; j < func[i]->len; j++, insn++) {
if (bpf_pseudo_func(insn)) {
- subprog = insn[1].imm;
+ subprog = insn->off;
insn[0].imm = (u32)(long)func[subprog]->bpf_func;
insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
continue;
@@ -12726,7 +12710,8 @@ static int jit_subprogs(struct bpf_verifier_env *env)
for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
if (bpf_pseudo_func(insn)) {
insn[0].imm = env->insn_aux_data[i].call_imm;
- insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
+ insn[1].imm = insn->off;
+ insn->off = 0;
continue;
}
if (!bpf_pseudo_call(insn))
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 35b920328344..81c9e0685948 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
for_each_root(root) {
struct cgroup *from_cgrp;
- if (root == &cgrp_dfl_root)
- continue;
-
spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
spin_unlock_irq(&css_set_lock);
@@ -662,11 +659,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
+ * Grab the subsystems state racily. No need to add avenue to
+ * cgroup_mutex contention.
*/
- mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
@@ -674,7 +669,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
- mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -701,8 +695,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
- mutex_lock(&cgroup_mutex);
-
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
@@ -710,9 +702,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
*/
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
+ if (!cgrp || !cgroup_tryget(cgrp)) {
rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
return -ENOENT;
}
rcu_read_unlock();
@@ -740,7 +731,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
css_task_iter_end(&it);
- mutex_unlock(&cgroup_mutex);
+ cgroup_put(cgrp);
return 0;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ea08f01d0111..919194de39c8 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1740,6 +1740,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
+ u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1756,8 +1757,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
+
+ /*
+ * Collect ssid's that need to be disabled from default
+ * hierarchy.
+ */
+ if (ss->root == &cgrp_dfl_root)
+ dfl_disable_ss_mask |= 1 << ssid;
+
} while_each_subsys_mask();
+ if (dfl_disable_ss_mask) {
+ struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+ /*
+ * Controllers from default hierarchy that need to be rebound
+ * are all disabled together in one go.
+ */
+ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
@@ -1766,10 +1787,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
- /* disable from the source */
- src_root->subsys_mask &= ~(1 << ssid);
- WARN_ON(cgroup_apply_control(scgrp));
- cgroup_finalize_control(scgrp, 0);
+ if (src_root != &cgrp_dfl_root) {
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
@@ -5911,17 +5934,20 @@ struct cgroup *cgroup_get_from_id(u64 id)
struct kernfs_node *kn;
struct cgroup *cgrp = NULL;
- mutex_lock(&cgroup_mutex);
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
- goto out_unlock;
+ goto out;
- cgrp = kn->priv;
- if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp && !cgroup_tryget(cgrp))
cgrp = NULL;
+
+ rcu_read_unlock();
+
kernfs_put(kn);
-out_unlock:
- mutex_unlock(&cgroup_mutex);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -6474,30 +6500,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
- * if @path points to a non-directory.
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
- struct cgroup *cgrp;
-
- mutex_lock(&cgroup_mutex);
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
- if (kn) {
- if (kernfs_type(kn) == KERNFS_DIR) {
- cgrp = kn->priv;
- cgroup_get_live(cgrp);
- } else {
- cgrp = ERR_PTR(-ENOTDIR);
- }
- kernfs_put(kn);
- } else {
- cgrp = ERR_PTR(-ENOENT);
+ if (!kn)
+ goto out;
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ cgrp = ERR_PTR(-ENOTDIR);
+ goto out_kernfs;
}
- mutex_unlock(&cgroup_mutex);
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+
+out_kernfs:
+ kernfs_put(kn);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
@@ -6625,44 +6655,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
-#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp,
- struct bpf_prog *prog, struct bpf_prog *replace_prog,
- struct bpf_cgroup_link *link,
- enum bpf_attach_type type,
- u32 flags)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-#endif /* CONFIG_CGROUP_BPF */
-
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ssize_t size, const char *prefix)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2a9695ccb65f..d0e163a02099 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -69,6 +69,13 @@
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding, add this key to provide a quick low-cost judgement
+ * of the situation.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
/* See "Frequency meter" comments, below. */
struct fmeter {
@@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+ if (!cpusets_insane_config() &&
+ movable_only_nodes(nodes)) {
+ static_branch_enable(&cpusets_insane_config_key);
+ pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+ "Cpuset allocations might fail even with a lot of memory available.\n",
+ nodemask_pr_args(nodes));
+ }
+}
+
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -1870,6 +1888,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ check_insane_mems_config(&trialcs->mems_allowed);
+
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
@@ -3173,6 +3193,9 @@ update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (mems_updated)
+ check_insane_mems_config(&new_mems);
+
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index ec02d963cad1..fe3e8a0eb7ed 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
new_usage = atomic_long_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
- if (!res->failed) {
- pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
- misc_res_name[type]);
- pr_cont_cgroup_path(i->css.cgroup);
- pr_cont("\n");
- res->failed = true;
- }
ret = -EBUSY;
goto err_charge;
}
@@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
return 0;
err_charge:
+ for (j = i; j; j = parent_misc(j)) {
+ atomic_long_inc(&j->res[type].events);
+ cgroup_file_notify(&j->events_file);
+ }
+
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
misc_cg_cancel_charge(type, i, amount);
@@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
return 0;
}
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ unsigned long events, i;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ events = atomic_long_read(&cg->res[i].events);
+ if (READ_ONCE(misc_res_capacity[i]) || events)
+ seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+ }
+ return 0;
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_capacity_show,
.flags = CFTYPE_ONLY_ON_ROOT,
},
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_file),
+ .seq_show = misc_events_show,
+ },
{}
};
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index b264ab5652ba..1486768f2318 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -433,8 +433,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
- cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
}
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 1ae0b4948a5a..473d17c431f3 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -676,15 +676,14 @@ int set_cred_ucounts(struct cred *new)
* This optimization is needed because alloc_ucounts() uses locks
* for table lookups.
*/
- if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+ if (old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
return 0;
if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid)))
return -EAGAIN;
new->ucounts = new_ucounts;
- if (old_ucounts)
- put_ucounts(old_ucounts);
+ put_ucounts(old_ucounts);
return 0;
}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 1f9f0e47aeda..10b454554ab0 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -46,7 +46,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
* btp <pid> Kernel stack for <pid>
* btt <address-expression> Kernel stack for task structure at
* <address-expression>
- * bta [DRSTCZEUIMA] All useful processes, optionally
+ * bta [state_chars>|A] All useful processes, optionally
* filtered by state
* btc [<cpu>] The current process on one cpu,
* default is all cpus
@@ -74,7 +74,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr)
*/
static int
-kdb_bt1(struct task_struct *p, unsigned long mask, bool btaprompt)
+kdb_bt1(struct task_struct *p, const char *mask, bool btaprompt)
{
char ch;
@@ -120,7 +120,7 @@ kdb_bt_cpu(unsigned long cpu)
return;
}
- kdb_bt1(kdb_tsk, ~0UL, false);
+ kdb_bt1(kdb_tsk, "A", false);
}
int
@@ -138,8 +138,8 @@ kdb_bt(int argc, const char **argv)
if (strcmp(argv[0], "bta") == 0) {
struct task_struct *g, *p;
unsigned long cpu;
- unsigned long mask = kdb_task_state_string(argc ? argv[1] :
- NULL);
+ const char *mask = argc ? argv[1] : kdbgetenv("PS");
+
if (argc == 0)
kdb_ps_suppressed();
/* Run the active tasks first */
@@ -167,7 +167,7 @@ kdb_bt(int argc, const char **argv)
return diag;
p = find_task_by_pid_ns(pid, &init_pid_ns);
if (p)
- return kdb_bt1(p, ~0UL, false);
+ return kdb_bt1(p, "A", false);
kdb_printf("No process with pid == %ld found\n", pid);
return 0;
} else if (strcmp(argv[0], "btt") == 0) {
@@ -176,7 +176,7 @@ kdb_bt(int argc, const char **argv)
diag = kdbgetularg((char *)argv[1], &addr);
if (diag)
return diag;
- return kdb_bt1((struct task_struct *)addr, ~0UL, false);
+ return kdb_bt1((struct task_struct *)addr, "A", false);
} else if (strcmp(argv[0], "btc") == 0) {
unsigned long cpu = ~0;
if (argc > 1)
@@ -212,7 +212,7 @@ kdb_bt(int argc, const char **argv)
kdb_show_stack(kdb_current_task, (void *)addr);
return 0;
} else {
- return kdb_bt1(kdb_current_task, ~0UL, false);
+ return kdb_bt1(kdb_current_task, "A", false);
}
}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index fa6deda894a1..0852a537dad4 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -2203,8 +2203,8 @@ static void kdb_cpu_status(void)
state = 'D'; /* cpu is online but unresponsive */
} else {
state = ' '; /* cpu is responding to kdb */
- if (kdb_task_state_char(KDB_TSK(i)) == 'I')
- state = 'I'; /* idle task */
+ if (kdb_task_state_char(KDB_TSK(i)) == '-')
+ state = '-'; /* idle task */
}
if (state != prev_state) {
if (prev_state != '?') {
@@ -2271,37 +2271,30 @@ static int kdb_cpu(int argc, const char **argv)
void kdb_ps_suppressed(void)
{
int idle = 0, daemon = 0;
- unsigned long mask_I = kdb_task_state_string("I"),
- mask_M = kdb_task_state_string("M");
unsigned long cpu;
const struct task_struct *p, *g;
for_each_online_cpu(cpu) {
p = kdb_curr_task(cpu);
- if (kdb_task_state(p, mask_I))
+ if (kdb_task_state(p, "-"))
++idle;
}
for_each_process_thread(g, p) {
- if (kdb_task_state(p, mask_M))
+ if (kdb_task_state(p, "ims"))
++daemon;
}
if (idle || daemon) {
if (idle)
- kdb_printf("%d idle process%s (state I)%s\n",
+ kdb_printf("%d idle process%s (state -)%s\n",
idle, idle == 1 ? "" : "es",
daemon ? " and " : "");
if (daemon)
- kdb_printf("%d sleeping system daemon (state M) "
+ kdb_printf("%d sleeping system daemon (state [ims]) "
"process%s", daemon,
daemon == 1 ? "" : "es");
kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
}
}
-/*
- * kdb_ps - This function implements the 'ps' command which shows a
- * list of the active processes.
- * ps [DRSTCZEUIMA] All processes, optionally filtered by state
- */
void kdb_ps1(const struct task_struct *p)
{
int cpu;
@@ -2330,17 +2323,25 @@ void kdb_ps1(const struct task_struct *p)
}
}
+/*
+ * kdb_ps - This function implements the 'ps' command which shows a
+ * list of the active processes.
+ *
+ * ps [<state_chars>] Show processes, optionally selecting only those whose
+ * state character is found in <state_chars>.
+ */
static int kdb_ps(int argc, const char **argv)
{
struct task_struct *g, *p;
- unsigned long mask, cpu;
+ const char *mask;
+ unsigned long cpu;
if (argc == 0)
kdb_ps_suppressed();
kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
(int)(2*sizeof(void *))+2, "Task Addr",
(int)(2*sizeof(void *))+2, "Thread");
- mask = kdb_task_state_string(argc ? argv[1] : NULL);
+ mask = argc ? argv[1] : kdbgetenv("PS");
/* Run the active tasks first */
for_each_online_cpu(cpu) {
if (KDB_FLAG(CMD_INTERRUPT))
@@ -2742,8 +2743,8 @@ static kdbtab_t maintab[] = {
},
{ .name = "bta",
.func = kdb_bt,
- .usage = "[D|R|S|T|C|Z|E|U|I|M|A]",
- .help = "Backtrace all processes matching state flag",
+ .usage = "[<state_chars>|A]",
+ .help = "Backtrace all processes whose state matches",
.flags = KDB_ENABLE_INSPECT,
},
{ .name = "btc",
@@ -2797,7 +2798,7 @@ static kdbtab_t maintab[] = {
},
{ .name = "ps",
.func = kdb_ps,
- .usage = "[<flags>|A]",
+ .usage = "[<state_chars>|A]",
.help = "Display active task list",
.flags = KDB_ENABLE_INSPECT,
},
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 629590084a0d..0d2f9feea0a4 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -190,10 +190,8 @@ extern char kdb_grep_string[];
extern int kdb_grep_leading;
extern int kdb_grep_trailing;
extern char *kdb_cmds[];
-extern unsigned long kdb_task_state_string(const char *);
extern char kdb_task_state_char (const struct task_struct *);
-extern unsigned long kdb_task_state(const struct task_struct *p,
- unsigned long mask);
+extern bool kdb_task_state(const struct task_struct *p, const char *mask);
extern void kdb_ps_suppressed(void);
extern void kdb_ps1(const struct task_struct *p);
extern void kdb_send_sig(struct task_struct *p, int sig);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 7507d9a8dc6a..df2bface866e 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -24,6 +24,7 @@
#include <linux/uaccess.h>
#include <linux/kdb.h>
#include <linux/slab.h>
+#include <linux/ctype.h>
#include "kdb_private.h"
/*
@@ -473,82 +474,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
return diag;
}
-/*
- * kdb_task_state_string - Convert a string containing any of the
- * letters DRSTCZEUIMA to a mask for the process state field and
- * return the value. If no argument is supplied, return the mask
- * that corresponds to environment variable PS, DRSTCZEU by
- * default.
- * Inputs:
- * s String to convert
- * Returns:
- * Mask for process state.
- * Notes:
- * The mask folds data from several sources into a single long value, so
- * be careful not to overlap the bits. TASK_* bits are in the LSB,
- * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
- * is no overlap between TASK_* and EXIT_* but that may not always be
- * true, so EXIT_* bits are shifted left 16 bits before being stored in
- * the mask.
- */
-
-/* unrunnable is < 0 */
-#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
-#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
-#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
-#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
-unsigned long kdb_task_state_string(const char *s)
-{
- long res = 0;
- if (!s) {
- s = kdbgetenv("PS");
- if (!s)
- s = "DRSTCZEU"; /* default value for ps */
- }
- while (*s) {
- switch (*s) {
- case 'D':
- res |= TASK_UNINTERRUPTIBLE;
- break;
- case 'R':
- res |= RUNNING;
- break;
- case 'S':
- res |= TASK_INTERRUPTIBLE;
- break;
- case 'T':
- res |= TASK_STOPPED;
- break;
- case 'C':
- res |= TASK_TRACED;
- break;
- case 'Z':
- res |= EXIT_ZOMBIE << 16;
- break;
- case 'E':
- res |= EXIT_DEAD << 16;
- break;
- case 'U':
- res |= UNRUNNABLE;
- break;
- case 'I':
- res |= IDLE;
- break;
- case 'M':
- res |= DAEMON;
- break;
- case 'A':
- res = ~0UL;
- break;
- default:
- kdb_func_printf("unknown flag '%c' ignored\n", *s);
- break;
- }
- ++s;
- }
- return res;
-}
/*
* kdb_task_state_char - Return the character that represents the task state.
@@ -559,7 +485,6 @@ unsigned long kdb_task_state_string(const char *s)
*/
char kdb_task_state_char (const struct task_struct *p)
{
- unsigned int p_state;
unsigned long tmp;
char state;
int cpu;
@@ -568,25 +493,18 @@ char kdb_task_state_char (const struct task_struct *p)
copy_from_kernel_nofault(&tmp, (char *)p, sizeof(unsigned long)))
return 'E';
- cpu = kdb_process_cpu(p);
- p_state = READ_ONCE(p->__state);
- state = (p_state == 0) ? 'R' :
- (p_state < 0) ? 'U' :
- (p_state & TASK_UNINTERRUPTIBLE) ? 'D' :
- (p_state & TASK_STOPPED) ? 'T' :
- (p_state & TASK_TRACED) ? 'C' :
- (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
- (p->exit_state & EXIT_DEAD) ? 'E' :
- (p_state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+ state = task_state_to_char((struct task_struct *) p);
+
if (is_idle_task(p)) {
/* Idle task. Is it really idle, apart from the kdb
* interrupt? */
+ cpu = kdb_process_cpu(p);
if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
if (cpu != kdb_initial_cpu)
- state = 'I'; /* idle task */
+ state = '-'; /* idle task */
}
- } else if (!p->mm && state == 'S') {
- state = 'M'; /* sleeping system daemon */
+ } else if (!p->mm && strchr("IMS", state)) {
+ state = tolower(state); /* sleeping system daemon */
}
return state;
}
@@ -596,14 +514,28 @@ char kdb_task_state_char (const struct task_struct *p)
* given by the mask.
* Inputs:
* p struct task for the process
- * mask mask from kdb_task_state_string to select processes
+ * mask set of characters used to select processes; both NULL
+ * and the empty string mean adopt a default filter, which
+ * is to suppress sleeping system daemons and the idle tasks
* Returns:
* True if the process matches at least one criteria defined by the mask.
*/
-unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
+bool kdb_task_state(const struct task_struct *p, const char *mask)
{
- char state[] = { kdb_task_state_char(p), '\0' };
- return (mask & kdb_task_state_string(state)) != 0;
+ char state = kdb_task_state_char(p);
+
+ /* If there is no mask, then we will filter code that runs when the
+ * scheduler is idling and any system daemons that are currently
+ * sleeping.
+ */
+ if (!mask || mask[0] == '\0')
+ return !strchr("-ims", state);
+
+ /* A is a special case that matches all states */
+ if (strchr(mask, 'A'))
+ return true;
+
+ return strchr(mask, state);
}
/* Maintain a small stack of kdb_flags to allow recursion without disturbing
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index 25fc85a7aebe..375fb3c9538d 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -40,7 +40,6 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
{
struct dma_coherent_mem *dma_mem;
int pages = size >> PAGE_SHIFT;
- int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
void *mem_base;
if (!size)
@@ -53,7 +52,7 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
if (!dma_mem)
goto out_unmap_membase;
- dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ dma_mem->bitmap = bitmap_zalloc(pages, GFP_KERNEL);
if (!dma_mem->bitmap)
goto out_free_dma_mem;
@@ -81,7 +80,7 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
return;
memunmap(mem->virt_base);
- kfree(mem->bitmap);
+ bitmap_free(mem->bitmap);
kfree(mem);
}
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c4ca040fdb05..8e840fbbed7c 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -247,7 +247,7 @@ swiotlb_init(int verbose)
return;
fail_free_mem:
- memblock_free_early(__pa(tlb), bytes);
+ memblock_free(tlb, bytes);
fail:
pr_warn("Cannot allocate buffer");
}
@@ -459,7 +459,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
* allocate a buffer from that IO TLB pool.
*/
static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
- size_t alloc_size)
+ size_t alloc_size, unsigned int alloc_align_mask)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
@@ -483,6 +483,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
if (alloc_size >= PAGE_SIZE)
stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
+ stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
spin_lock_irqsave(&mem->lock, flags);
if (unlikely(nslots > mem->nslabs - mem->used))
@@ -541,7 +542,8 @@ found:
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
- enum dma_data_direction dir, unsigned long attrs)
+ unsigned int alloc_align_mask, enum dma_data_direction dir,
+ unsigned long attrs)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
@@ -561,7 +563,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
- index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
+ index = swiotlb_find_slots(dev, orig_addr,
+ alloc_size + offset, alloc_align_mask);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@@ -675,7 +678,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
swiotlb_force);
- swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir,
+ swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
@@ -759,7 +762,7 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
if (!mem)
return NULL;
- index = swiotlb_find_slots(dev, 0, size);
+ index = swiotlb_find_slots(dev, 0, size, 0);
if (index == -1)
return NULL;
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index c240302f56e2..4508201847d2 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -47,14 +47,18 @@ bool syscall_user_dispatch(struct pt_regs *regs)
* access_ok() is performed once, at prctl time, when
* the selector is loaded by userspace.
*/
- if (unlikely(__get_user(state, sd->selector)))
- do_exit(SIGSEGV);
+ if (unlikely(__get_user(state, sd->selector))) {
+ force_fatal_sig(SIGSEGV);
+ return true;
+ }
if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW))
return false;
- if (state != SYSCALL_DISPATCH_FILTER_BLOCK)
- do_exit(SIGSYS);
+ if (state != SYSCALL_DISPATCH_FILTER_BLOCK) {
+ force_fatal_sig(SIGSYS);
+ return true;
+ }
}
sd->on_dispatch = true;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f2253ea729a2..523106a506ee 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7154,7 +7154,6 @@ void perf_output_sample(struct perf_output_handle *handle,
static u64 perf_virt_to_phys(u64 virt)
{
u64 phys_addr = 0;
- struct page *p = NULL;
if (!virt)
return 0;
@@ -7173,14 +7172,15 @@ static u64 perf_virt_to_phys(u64 virt)
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
+ struct page *p;
+
pagefault_disable();
- if (get_user_page_fast_only(virt, 0, &p))
+ if (get_user_page_fast_only(virt, 0, &p)) {
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ put_page(p);
+ }
pagefault_enable();
}
-
- if (p)
- put_page(p);
}
return phys_addr;
diff --git a/kernel/exit.c b/kernel/exit.c
index 50f1692c732d..f702a6a63686 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -340,6 +340,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
+static void coredump_task_exit(struct task_struct *tsk)
+{
+ struct core_state *core_state;
+
+ /*
+ * Serialize with any possible pending coredump.
+ * We must hold siglock around checking core_state
+ * and setting PF_POSTCOREDUMP. The core-inducing thread
+ * will increment ->nr_threads for each thread in the
+ * group without PF_POSTCOREDUMP set.
+ */
+ spin_lock_irq(&tsk->sighand->siglock);
+ tsk->flags |= PF_POSTCOREDUMP;
+ core_state = tsk->signal->core_state;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (core_state) {
+ struct core_thread self;
+
+ self.task = current;
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
+ /*
+ * Implies mb(), the result of xchg() must be visible
+ * to core_state->dumper.
+ */
+ if (atomic_dec_and_test(&core_state->nr_threads))
+ complete(&core_state->startup);
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!self.task) /* see coredump_finish() */
+ break;
+ freezable_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+ }
+}
+
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
@@ -435,47 +475,12 @@ assign_new_owner:
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
- struct core_state *core_state;
exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
- /*
- * Serialize with any possible pending coredump.
- * We must hold mmap_lock around checking core_state
- * and clearing tsk->mm. The core-inducing thread
- * will increment ->nr_threads for each thread in the
- * group with ->mm != NULL.
- */
mmap_read_lock(mm);
- core_state = mm->core_state;
- if (core_state) {
- struct core_thread self;
-
- mmap_read_unlock(mm);
-
- self.task = current;
- if (self.task->flags & PF_SIGNALED)
- self.next = xchg(&core_state->dumper.next, &self);
- else
- self.task = NULL;
- /*
- * Implies mb(), the result of xchg() must be visible
- * to core_state->dumper.
- */
- if (atomic_dec_and_test(&core_state->nr_threads))
- complete(&core_state->startup);
-
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (!self.task) /* see coredump_finish() */
- break;
- freezable_schedule();
- }
- __set_current_state(TASK_RUNNING);
- mmap_read_lock(mm);
- }
mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
@@ -763,6 +768,7 @@ void __noreturn do_exit(long code)
profile_task_exit(tsk);
kcov_task_exit(tsk);
+ coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
diff --git a/kernel/extable.c b/kernel/extable.c
index b0ea5eb0c3b4..b6f330f0fe74 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -62,40 +62,13 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
-int init_kernel_text(unsigned long addr)
-{
- if (addr >= (unsigned long)_sinittext &&
- addr < (unsigned long)_einittext)
- return 1;
- return 0;
-}
-
int notrace core_kernel_text(unsigned long addr)
{
- if (addr >= (unsigned long)_stext &&
- addr < (unsigned long)_etext)
+ if (is_kernel_text(addr))
return 1;
- if (system_state < SYSTEM_RUNNING &&
- init_kernel_text(addr))
- return 1;
- return 0;
-}
-
-/**
- * core_kernel_data - tell if addr points to kernel data
- * @addr: address to test
- *
- * Returns true if @addr passed in is from the core kernel data
- * section.
- *
- * Note: On some archs it may return true for core RODATA, and false
- * for others. But will always be true for core RW data.
- */
-int core_kernel_data(unsigned long addr)
-{
- if (addr >= (unsigned long)_sdata &&
- addr < (unsigned long)_edata)
+ if (system_state < SYSTEM_FREEING_INITMEM &&
+ is_kernel_inittext(addr))
return 1;
return 0;
}
@@ -112,7 +85,7 @@ int __kernel_text_address(unsigned long addr)
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
- if (init_kernel_text(addr))
+ if (is_kernel_inittext(addr))
return 1;
return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 8e9feeef555e..3244cc56b697 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1043,7 +1043,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
- mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1391,8 +1390,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* purposes.
*/
if (tsk->clear_child_tid) {
- if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
- atomic_read(&mm->mm_users) > 1) {
+ if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
@@ -2279,6 +2277,7 @@ static __latent_entropy struct task_struct *copy_process(
p->pdeath_signal = 0;
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
+ clear_posix_cputimers_work(p);
#ifdef CONFIG_KRETPROBES
p->kretprobe_instances.first = NULL;
@@ -3026,7 +3025,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
- struct files_struct *fd, *new_fd = NULL;
+ struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
@@ -3113,11 +3112,8 @@ int ksys_unshare(unsigned long unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_fd) {
- fd = current->files;
- current->files = new_fd;
- new_fd = fd;
- }
+ if (new_fd)
+ swap(current->files, new_fd);
task_unlock(current);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 4d8fc65cf38f..bf38c546aa25 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -744,9 +744,8 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
-static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
- unsigned int count,
- struct irq_fwspec *fwspec)
+void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
+ unsigned int count, struct irq_fwspec *fwspec)
{
int i;
@@ -756,6 +755,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
for (i = 0; i < count; i++)
fwspec->param[i] = args[i];
}
+EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
@@ -1502,6 +1502,7 @@ out_free_desc:
irq_free_descs(virq, nr_irqs);
return ret;
}
+EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 6a5ecee6e567..7f350ae59c5f 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -529,10 +529,10 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
/*
* Checking the first MSI descriptor is sufficient. MSIX supports
- * masking and MSI does so when the maskbit is set.
+ * masking and MSI does so when the can_mask attribute is set.
*/
desc = first_msi_entry(dev);
- return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
+ return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask;
}
int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 80bfe71bbe13..36ca640c4f8e 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
struct kcov_percpu_data {
void *irq_area;
+ local_lock_t lock;
unsigned int saved_mode;
unsigned int saved_size;
@@ -96,7 +97,9 @@ struct kcov_percpu_data {
int saved_sequence;
};
-static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
+static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
/*
* Check that kcov_remote_start() is not called twice in background
@@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle)
*/
mode = READ_ONCE(t->kcov_mode);
if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle)
* happened while collecting coverage from a background thread.
*/
if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
kcov_debug("handle = %llx, context: %s\n", handle,
@@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle)
size = CONFIG_KCOV_IRQ_AREA_SIZE;
area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
}
- spin_unlock_irqrestore(&kcov_remote_lock, flags);
+ spin_unlock(&kcov_remote_lock);
/* Can only happen when in_task(). */
if (!area) {
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
kcov_put(kcov);
return;
}
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
}
- local_irq_save(flags);
-
/* Reset coverage size. */
*(u64 *)area = 0;
@@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle)
}
kcov_start(t, kcov, size, area, mode, sequence);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@@ -965,12 +969,12 @@ void kcov_remote_stop(void)
if (!in_task() && !in_serving_softirq())
return;
- local_irq_save(flags);
+ local_lock_irqsave(&kcov_percpu_data.lock, flags);
mode = READ_ONCE(t->kcov_mode);
barrier();
if (!kcov_mode_enabled(mode)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@@ -978,12 +982,12 @@ void kcov_remote_stop(void)
* actually found the remote handle and started collecting coverage.
*/
if (in_serving_softirq() && !t->kcov_softirq) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/* Make sure that kcov_softirq is only set when in softirq. */
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
@@ -1013,7 +1017,7 @@ void kcov_remote_stop(void)
spin_unlock(&kcov_remote_lock);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
/* Get in kcov_remote_start(). */
kcov_put(kcov);
@@ -1034,8 +1038,8 @@ static int __init kcov_init(void)
int cpu;
for_each_possible_cpu(cpu) {
- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
- sizeof(unsigned long));
+ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
+ sizeof(unsigned long), cpu_to_node(cpu));
if (!area)
return -ENOMEM;
per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 76e67d1e02d4..4b84c8e7884b 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -202,6 +202,9 @@ static __always_inline struct kcsan_ctx *get_ctx(void)
return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
}
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
+
/* Check scoped accesses; never inline because this is a slow-path! */
static noinline void kcsan_check_scoped_accesses(void)
{
@@ -210,14 +213,16 @@ static noinline void kcsan_check_scoped_accesses(void)
struct kcsan_scoped_access *scoped_access;
ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
- list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
- __kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
+ list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
+ check_access(scoped_access->ptr, scoped_access->size,
+ scoped_access->type, scoped_access->ip);
+ }
ctx->scoped_accesses.prev = prev_save;
}
/* Rules for generic atomic accesses. Called from fast-path. */
static __always_inline bool
-is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+is_atomic(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
if (type & KCSAN_ACCESS_ATOMIC)
return true;
@@ -254,7 +259,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
}
static __always_inline bool
-should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
+should_watch(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
/*
* Never set up watchpoints when memory operations are atomic.
@@ -263,7 +268,7 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
* should not count towards skipped instructions, and (2) to actually
* decrement kcsan_atomic_next for consecutive instruction stream.
*/
- if (is_atomic(ptr, size, type, ctx))
+ if (is_atomic(ctx, ptr, size, type))
return false;
if (this_cpu_dec_return(kcsan_skip) >= 0)
@@ -350,6 +355,7 @@ void kcsan_restore_irqtrace(struct task_struct *task)
static noinline void kcsan_found_watchpoint(const volatile void *ptr,
size_t size,
int type,
+ unsigned long ip,
atomic_long_t *watchpoint,
long encoded_watchpoint)
{
@@ -396,7 +402,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
if (consumed) {
kcsan_save_irqtrace(current);
- kcsan_report_set_info(ptr, size, type, watchpoint - watchpoints);
+ kcsan_report_set_info(ptr, size, type, ip, watchpoint - watchpoints);
kcsan_restore_irqtrace(current);
} else {
/*
@@ -416,7 +422,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
}
static noinline void
-kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
+kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
@@ -568,8 +574,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- kcsan_report_known_origin(ptr, size, type, value_change,
- watchpoint - watchpoints,
+ kcsan_report_known_origin(ptr, size, type, ip,
+ value_change, watchpoint - watchpoints,
old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
@@ -578,8 +584,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
- if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
- kcsan_report_unknown_origin(ptr, size, type, old, new, access_mask);
+ if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) {
+ kcsan_report_unknown_origin(ptr, size, type, ip,
+ old, new, access_mask);
+ }
}
/*
@@ -596,8 +604,8 @@ out:
user_access_restore(ua_flags);
}
-static __always_inline void check_access(const volatile void *ptr, size_t size,
- int type)
+static __always_inline void
+check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
atomic_long_t *watchpoint;
@@ -625,13 +633,12 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
*/
if (unlikely(watchpoint != NULL))
- kcsan_found_watchpoint(ptr, size, type, watchpoint,
- encoded_watchpoint);
+ kcsan_found_watchpoint(ptr, size, type, ip, watchpoint, encoded_watchpoint);
else {
struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
- if (unlikely(should_watch(ptr, size, type, ctx)))
- kcsan_setup_watchpoint(ptr, size, type);
+ if (unlikely(should_watch(ctx, ptr, size, type)))
+ kcsan_setup_watchpoint(ptr, size, type, ip);
else if (unlikely(ctx->scoped_accesses.prev))
kcsan_check_scoped_accesses();
}
@@ -757,7 +764,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
{
struct kcsan_ctx *ctx = get_ctx();
- __kcsan_check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
@@ -765,6 +772,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
sa->ptr = ptr;
sa->size = size;
sa->type = type;
+ sa->ip = _RET_IP_;
if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
INIT_LIST_HEAD(&ctx->scoped_accesses);
@@ -796,13 +804,13 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
ctx->disable_count--;
- __kcsan_check_access(sa->ptr, sa->size, sa->type);
+ check_access(sa->ptr, sa->size, sa->type, sa->ip);
}
EXPORT_SYMBOL(kcsan_end_scoped_access);
void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
{
- check_access(ptr, size, type);
+ check_access(ptr, size, type, _RET_IP_);
}
EXPORT_SYMBOL(__kcsan_check_access);
@@ -823,7 +831,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read##size(void *ptr); \
void __tsan_read##size(void *ptr) \
{ \
- check_access(ptr, size, 0); \
+ check_access(ptr, size, 0, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read##size); \
void __tsan_unaligned_read##size(void *ptr) \
@@ -832,7 +840,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_write##size(void *ptr); \
void __tsan_write##size(void *ptr) \
{ \
- check_access(ptr, size, KCSAN_ACCESS_WRITE); \
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
@@ -842,7 +850,8 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read_write##size(void *ptr) \
{ \
check_access(ptr, size, \
- KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read_write##size); \
void __tsan_unaligned_read_write##size(void *ptr) \
@@ -858,14 +867,14 @@ DEFINE_TSAN_READ_WRITE(16);
void __tsan_read_range(void *ptr, size_t size);
void __tsan_read_range(void *ptr, size_t size)
{
- check_access(ptr, size, 0);
+ check_access(ptr, size, 0, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_read_range);
void __tsan_write_range(void *ptr, size_t size);
void __tsan_write_range(void *ptr, size_t size)
{
- check_access(ptr, size, KCSAN_ACCESS_WRITE);
+ check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_write_range);
@@ -886,7 +895,8 @@ EXPORT_SYMBOL(__tsan_write_range);
IS_ALIGNED((unsigned long)ptr, size); \
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
return; \
- check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
+ check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0, \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_read##size); \
void __tsan_unaligned_volatile_read##size(void *ptr) \
@@ -901,7 +911,8 @@ EXPORT_SYMBOL(__tsan_write_range);
return; \
check_access(ptr, size, \
KCSAN_ACCESS_WRITE | \
- (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
+ (is_atomic ? KCSAN_ACCESS_ATOMIC : 0), \
+ _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_write##size); \
void __tsan_unaligned_volatile_write##size(void *ptr) \
@@ -955,7 +966,7 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
{ \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
- check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
+ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_load_n(ptr, memorder); \
} \
@@ -965,7 +976,7 @@ EXPORT_SYMBOL(__tsan_init);
{ \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
- KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_store_n(ptr, v, memorder); \
} \
@@ -978,7 +989,7 @@ EXPORT_SYMBOL(__tsan_init);
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_##op##suffix(ptr, v, memorder); \
} \
@@ -1010,7 +1021,7 @@ EXPORT_SYMBOL(__tsan_init);
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
} \
@@ -1025,7 +1036,7 @@ EXPORT_SYMBOL(__tsan_init);
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
- KCSAN_ACCESS_ATOMIC); \
+ KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
return exp; \
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index f36e25c497ed..ae33c2a7f07e 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -121,7 +121,7 @@ enum kcsan_value_change {
* to be consumed by the reporting thread. No report is printed yet.
*/
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
- int watchpoint_idx);
+ unsigned long ip, int watchpoint_idx);
/*
* The calling thread observed that the watchpoint it set up was hit and
@@ -129,14 +129,14 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
* thread.
*/
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change, int watchpoint_idx,
- u64 old, u64 new, u64 mask);
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask);
/*
* No other thread was observed to race with the access, but the data value
* before and after the stall differs. Reports a race of "unknown origin".
*/
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
- u64 old, u64 new, u64 mask);
+ unsigned long ip, u64 old, u64 new, u64 mask);
#endif /* _KERNEL_KCSAN_KCSAN_H */
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index dc55fd5a36fc..660729238588 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -29,6 +29,11 @@
#include <linux/types.h>
#include <trace/events/printk.h>
+#define KCSAN_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
#else
@@ -205,10 +210,12 @@ static bool report_matches(const struct expect_report *r)
"read-write" :
"write") :
"read");
+ const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
+ const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
const char *const access_type_aux =
- (ty & KCSAN_ACCESS_ATOMIC) ?
- " (marked)" :
- ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : "");
+ (is_atomic && is_scoped) ? " (marked, scoped)"
+ : (is_atomic ? " (marked)"
+ : (is_scoped ? " (scoped)" : ""));
if (i == 1) {
/* Access 2 */
@@ -333,7 +340,10 @@ static noinline void test_kernel_assert_bits_nochange(void)
ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
}
-/* To check that scoped assertions do trigger anywhere in scope. */
+/*
+ * Scoped assertions do trigger anywhere in scope. However, the report should
+ * still only point at the start of the scope.
+ */
static noinline void test_enter_scope(void)
{
int x = 0;
@@ -488,17 +498,24 @@ static void test_concurrent_races(struct kunit *test)
__no_kcsan
static void test_novalue_change(struct kunit *test)
{
- const struct expect_report expect = {
+ const struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
+ const struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
bool match_expect = false;
+ test_kernel_write_nochange(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange, test_kernel_read);
do {
- match_expect = report_matches(&expect);
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
KUNIT_EXPECT_FALSE(test, match_expect);
@@ -513,17 +530,24 @@ static void test_novalue_change(struct kunit *test)
__no_kcsan
static void test_novalue_change_exception(struct kunit *test)
{
- const struct expect_report expect = {
+ const struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
+ const struct expect_report expect_ww = {
+ .access = {
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ { test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
+ },
+ };
bool match_expect = false;
+ test_kernel_write_nochange_rcu(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
do {
- match_expect = report_matches(&expect);
+ match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
KUNIT_EXPECT_TRUE(test, match_expect);
}
@@ -642,8 +666,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
};
bool match_expect = false;
- if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
- return;
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_write_atomic);
do {
@@ -665,8 +688,7 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
};
bool match_expect = false;
- if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
- return;
+ KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
do {
@@ -828,22 +850,22 @@ static void test_assert_exclusive_writer_scoped(struct kunit *test)
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
- const struct expect_report expect_anywhere = {
+ const struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
bool match_expect_start = false;
- bool match_expect_anywhere = false;
+ bool match_expect_inscope = false;
begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
do {
match_expect_start |= report_matches(&expect_start);
- match_expect_anywhere |= report_matches(&expect_anywhere);
- } while (!end_test_checks(match_expect_start && match_expect_anywhere));
+ match_expect_inscope |= report_matches(&expect_inscope);
+ } while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
- KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
__no_kcsan
@@ -872,9 +894,9 @@ static void test_assert_exclusive_access_scoped(struct kunit *test)
do {
match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
match_expect_inscope |= report_matches(&expect_inscope);
- } while (!end_test_checks(match_expect_start && match_expect_inscope));
+ } while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
- KUNIT_EXPECT_TRUE(test, match_expect_inscope);
+ KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
/*
@@ -1224,7 +1246,7 @@ static void kcsan_test_exit(void)
tracepoint_synchronize_unregister();
}
-late_initcall(kcsan_test_init);
+late_initcall_sync(kcsan_test_init);
module_exit(kcsan_test_exit);
MODULE_LICENSE("GPL v2");
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 21137929d428..fc15077991c4 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -8,6 +8,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
+#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
@@ -31,6 +32,7 @@ struct access_info {
int access_type;
int task_pid;
int cpu_id;
+ unsigned long ip;
};
/*
@@ -245,6 +247,10 @@ static const char *get_access_type(int type)
return "write (scoped)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "write (marked, scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write (scoped)";
+ case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked, scoped)";
default:
BUG();
}
@@ -300,6 +306,48 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
return skip;
}
+/*
+ * Skips to the first entry that matches the function of @ip, and then replaces
+ * that entry with @ip, returning the entries to skip.
+ */
+static int
+replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip)
+{
+ unsigned long symbolsize, offset;
+ unsigned long target_func;
+ int skip;
+
+ if (kallsyms_lookup_size_offset(ip, &symbolsize, &offset))
+ target_func = ip - offset;
+ else
+ goto fallback;
+
+ for (skip = 0; skip < num_entries; ++skip) {
+ unsigned long func = stack_entries[skip];
+
+ if (!kallsyms_lookup_size_offset(func, &symbolsize, &offset))
+ goto fallback;
+ func -= offset;
+
+ if (func == target_func) {
+ stack_entries[skip] = ip;
+ return skip;
+ }
+ }
+
+fallback:
+ /* Should not happen; the resulting stack trace is likely misleading. */
+ WARN_ONCE(1, "Cannot find frame for %pS in stack trace", (void *)ip);
+ return get_stack_skipnr(stack_entries, num_entries);
+}
+
+static int
+sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip)
+{
+ return ip ? replace_stack_entry(stack_entries, num_entries, ip) :
+ get_stack_skipnr(stack_entries, num_entries);
+}
+
/* Compares symbolized strings of addr1 and addr2. */
static int sym_strcmp(void *addr1, void *addr2)
{
@@ -327,12 +375,12 @@ static void print_verbose_info(struct task_struct *task)
static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
- const struct other_info *other_info,
+ struct other_info *other_info,
u64 old, u64 new, u64 mask)
{
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
- int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
+ int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip);
unsigned long this_frame = stack_entries[skipnr];
unsigned long other_frame = 0;
int other_skipnr = 0; /* silence uninit warnings */
@@ -344,8 +392,9 @@ static void print_report(enum kcsan_value_change value_change,
return;
if (other_info) {
- other_skipnr = get_stack_skipnr(other_info->stack_entries,
- other_info->num_stack_entries);
+ other_skipnr = sanitize_stack_entries(other_info->stack_entries,
+ other_info->num_stack_entries,
+ other_info->ai.ip);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
@@ -576,21 +625,23 @@ discard:
}
static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
- int access_type)
+ int access_type, unsigned long ip)
{
return (struct access_info) {
.ptr = ptr,
.size = size,
.access_type = access_type,
.task_pid = in_task() ? task_pid_nr(current) : -1,
- .cpu_id = raw_smp_processor_id()
+ .cpu_id = raw_smp_processor_id(),
+ /* Only replace stack entry with @ip if scoped access. */
+ .ip = (access_type & KCSAN_ACCESS_SCOPED) ? ip : 0,
};
}
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
- int watchpoint_idx)
+ unsigned long ip, int watchpoint_idx)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();
@@ -603,10 +654,10 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
}
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
- enum kcsan_value_change value_change, int watchpoint_idx,
- u64 old, u64 new, u64 mask)
+ unsigned long ip, enum kcsan_value_change value_change,
+ int watchpoint_idx, u64 old, u64 new, u64 mask)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
struct other_info *other_info = &other_infos[watchpoint_idx];
unsigned long flags = 0;
@@ -637,9 +688,9 @@ out:
}
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
- u64 old, u64 new, u64 mask)
+ unsigned long ip, u64 old, u64 new, u64 mask)
{
- const struct access_info ai = prepare_access_info(ptr, size, access_type);
+ const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index 7f29cb0f5e63..b4295a3892b7 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -18,7 +18,7 @@
#define ITERS_PER_TEST 2000
/* Test requirements. */
-static bool test_requires(void)
+static bool __init test_requires(void)
{
/* random should be initialized for the below tests */
return prandom_u32() + prandom_u32() != 0;
@@ -28,14 +28,18 @@ static bool test_requires(void)
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
*/
-static bool test_encode_decode(void)
+static bool __init test_encode_decode(void)
{
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
bool is_write = !!prandom_u32_max(2);
+ unsigned long verif_masked_addr;
+ long encoded_watchpoint;
+ bool verif_is_write;
unsigned long addr;
+ size_t verif_size;
prandom_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
@@ -44,53 +48,37 @@ static bool test_encode_decode(void)
if (WARN_ON(!check_encodable(addr, size)))
return false;
- /* Encode and decode */
- {
- const long encoded_watchpoint =
- encode_watchpoint(addr, size, is_write);
- unsigned long verif_masked_addr;
- size_t verif_size;
- bool verif_is_write;
-
- /* Check special watchpoints */
- if (WARN_ON(decode_watchpoint(
- INVALID_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(decode_watchpoint(
- CONSUMED_WATCHPOINT, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
-
- /* Check decoding watchpoint returns same data */
- if (WARN_ON(!decode_watchpoint(
- encoded_watchpoint, &verif_masked_addr,
- &verif_size, &verif_is_write)))
- return false;
- if (WARN_ON(verif_masked_addr !=
- (addr & WATCHPOINT_ADDR_MASK)))
- goto fail;
- if (WARN_ON(verif_size != size))
- goto fail;
- if (WARN_ON(is_write != verif_is_write))
- goto fail;
-
- continue;
-fail:
- pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
- __func__, is_write ? "write" : "read", size,
- addr, encoded_watchpoint,
- verif_is_write ? "write" : "read", verif_size,
- verif_masked_addr);
+ encoded_watchpoint = encode_watchpoint(addr, size, is_write);
+
+ /* Check special watchpoints */
+ if (WARN_ON(decode_watchpoint(INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
return false;
- }
+ if (WARN_ON(decode_watchpoint(CONSUMED_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+
+ /* Check decoding watchpoint returns same data */
+ if (WARN_ON(!decode_watchpoint(encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write)))
+ return false;
+ if (WARN_ON(verif_masked_addr != (addr & WATCHPOINT_ADDR_MASK)))
+ goto fail;
+ if (WARN_ON(verif_size != size))
+ goto fail;
+ if (WARN_ON(is_write != verif_is_write))
+ goto fail;
+
+ continue;
+fail:
+ pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
+ __func__, is_write ? "write" : "read", size, addr, encoded_watchpoint,
+ verif_is_write ? "write" : "read", verif_size, verif_masked_addr);
+ return false;
}
return true;
}
/* Test access matching function. */
-static bool test_matching_access(void)
+static bool __init test_matching_access(void)
{
if (WARN_ON(!matching_access(10, 1, 10, 1)))
return false;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 33400ff051a8..8347fc158d2b 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -556,6 +556,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
+ /*
+ * Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
+ * IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
+ * locate_mem_hole_callback().
+ */
if (kbuf->top_down) {
for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
&mstart, &mend, NULL) {
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4a4d7092a2d8..7113003fab63 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -433,7 +433,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* If thread is going to be bound on a particular cpu, give its node
* in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
- * argument. @threadfn() can either call do_exit() directly if it is a
+ * argument. @threadfn() can either return directly if it is a
* standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 7096384dc60f..2270ec68f10a 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
+/*
+ * Check if an address is part of freed initmem. After initmem is freed,
+ * memory can be allocated from it, and such allocations would then have
+ * addresses within the range [_stext, _end].
+ */
+#ifndef arch_is_kernel_initmem_freed
+static int arch_is_kernel_initmem_freed(unsigned long addr)
+{
+ if (system_state < SYSTEM_FREEING_INITMEM)
+ return 0;
+
+ return init_section_contains((void *)addr, 1);
+}
+#endif
+
static int static_obj(const void *obj)
{
unsigned long start = (unsigned long) &_stext,
@@ -803,9 +818,6 @@ static int static_obj(const void *obj)
if ((addr >= start) && (addr < end))
return 1;
- if (arch_is_kernel_data(addr))
- return 1;
-
/*
* in-kernel percpu var?
*/
diff --git a/kernel/module.c b/kernel/module.c
index 5c26a76e800b..84a9141a5e15 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2942,7 +2942,11 @@ static int module_sig_check(struct load_info *info, int flags)
static int validate_section_offset(struct load_info *info, Elf_Shdr *shdr)
{
+#if defined(CONFIG_64BIT)
+ unsigned long long secend;
+#else
unsigned long secend;
+#endif
/*
* Check for both overflow and offset/size being
@@ -2967,14 +2971,29 @@ static int elf_validity_check(struct load_info *info)
Elf_Shdr *shdr, *strhdr;
int err;
- if (info->len < sizeof(*(info->hdr)))
- return -ENOEXEC;
+ if (info->len < sizeof(*(info->hdr))) {
+ pr_err("Invalid ELF header len %lu\n", info->len);
+ goto no_exec;
+ }
- if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
- || info->hdr->e_type != ET_REL
- || !elf_check_arch(info->hdr)
- || info->hdr->e_shentsize != sizeof(Elf_Shdr))
- return -ENOEXEC;
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0) {
+ pr_err("Invalid ELF header magic: != %s\n", ELFMAG);
+ goto no_exec;
+ }
+ if (info->hdr->e_type != ET_REL) {
+ pr_err("Invalid ELF header type: %u != %u\n",
+ info->hdr->e_type, ET_REL);
+ goto no_exec;
+ }
+ if (!elf_check_arch(info->hdr)) {
+ pr_err("Invalid architecture in ELF header: %u\n",
+ info->hdr->e_machine);
+ goto no_exec;
+ }
+ if (info->hdr->e_shentsize != sizeof(Elf_Shdr)) {
+ pr_err("Invalid ELF section header size\n");
+ goto no_exec;
+ }
/*
* e_shnum is 16 bits, and sizeof(Elf_Shdr) is
@@ -2983,8 +3002,10 @@ static int elf_validity_check(struct load_info *info)
*/
if (info->hdr->e_shoff >= info->len
|| (info->hdr->e_shnum * sizeof(Elf_Shdr) >
- info->len - info->hdr->e_shoff))
- return -ENOEXEC;
+ info->len - info->hdr->e_shoff)) {
+ pr_err("Invalid ELF section header overflow\n");
+ goto no_exec;
+ }
info->sechdrs = (void *)info->hdr + info->hdr->e_shoff;
@@ -2992,13 +3013,19 @@ static int elf_validity_check(struct load_info *info)
* Verify if the section name table index is valid.
*/
if (info->hdr->e_shstrndx == SHN_UNDEF
- || info->hdr->e_shstrndx >= info->hdr->e_shnum)
- return -ENOEXEC;
+ || info->hdr->e_shstrndx >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF section name index: %d || e_shstrndx (%d) >= e_shnum (%d)\n",
+ info->hdr->e_shstrndx, info->hdr->e_shstrndx,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
strhdr = &info->sechdrs[info->hdr->e_shstrndx];
err = validate_section_offset(info, strhdr);
- if (err < 0)
+ if (err < 0) {
+ pr_err("Invalid ELF section hdr(type %u)\n", strhdr->sh_type);
return err;
+ }
/*
* The section name table must be NUL-terminated, as required
@@ -3006,8 +3033,10 @@ static int elf_validity_check(struct load_info *info)
* strings in the section safe.
*/
info->secstrings = (void *)info->hdr + strhdr->sh_offset;
- if (info->secstrings[strhdr->sh_size - 1] != '\0')
- return -ENOEXEC;
+ if (info->secstrings[strhdr->sh_size - 1] != '\0') {
+ pr_err("ELF Spec violation: section name table isn't null terminated\n");
+ goto no_exec;
+ }
/*
* The code assumes that section 0 has a length of zero and
@@ -3015,8 +3044,11 @@ static int elf_validity_check(struct load_info *info)
*/
if (info->sechdrs[0].sh_type != SHT_NULL
|| info->sechdrs[0].sh_size != 0
- || info->sechdrs[0].sh_addr != 0)
- return -ENOEXEC;
+ || info->sechdrs[0].sh_addr != 0) {
+ pr_err("ELF Spec violation: section 0 type(%d)!=SH_NULL or non-zero len or addr\n",
+ info->sechdrs[0].sh_type);
+ goto no_exec;
+ }
for (i = 1; i < info->hdr->e_shnum; i++) {
shdr = &info->sechdrs[i];
@@ -3026,8 +3058,12 @@ static int elf_validity_check(struct load_info *info)
continue;
case SHT_SYMTAB:
if (shdr->sh_link == SHN_UNDEF
- || shdr->sh_link >= info->hdr->e_shnum)
- return -ENOEXEC;
+ || shdr->sh_link >= info->hdr->e_shnum) {
+ pr_err("Invalid ELF sh_link!=SHN_UNDEF(%d) or (sh_link(%d) >= hdr->e_shnum(%d)\n",
+ shdr->sh_link, shdr->sh_link,
+ info->hdr->e_shnum);
+ goto no_exec;
+ }
fallthrough;
default:
err = validate_section_offset(info, shdr);
@@ -3049,6 +3085,9 @@ static int elf_validity_check(struct load_info *info)
}
return 0;
+
+no_exec:
+ return -ENOEXEC;
}
#define COPY_CHUNK_SIZE (16*PAGE_SIZE)
@@ -3940,10 +3979,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
* sections.
*/
err = elf_validity_check(info);
- if (err) {
- pr_err("Module has invalid ELF structures\n");
+ if (err)
goto free_copy;
- }
/*
* Everything checks out, so set up the section info
diff --git a/kernel/pid.c b/kernel/pid.c
index efe87db44683..2fc0a16ec77b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -540,6 +540,42 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
}
/**
+ * pidfd_get_task() - Get the task associated with a pidfd
+ *
+ * @pidfd: pidfd for which to get the task
+ * @flags: flags associated with this pidfd
+ *
+ * Return the task associated with @pidfd. The function takes a reference on
+ * the returned task. The caller is responsible for releasing that reference.
+ *
+ * Currently, the process identified by @pidfd is always a thread-group leader.
+ * This restriction currently exists for all aspects of pidfds including pidfd
+ * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling
+ * (only supports thread group leaders).
+ *
+ * Return: On success, the task_struct associated with the pidfd.
+ * On error, a negative errno number will be returned.
+ */
+struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags)
+{
+ unsigned int f_flags;
+ struct pid *pid;
+ struct task_struct *task;
+
+ pid = pidfd_get_pid(pidfd, &f_flags);
+ if (IS_ERR(pid))
+ return ERR_CAST(pid);
+
+ task = get_pid_task(pid, PIDTYPE_TGID);
+ put_pid(pid);
+ if (!task)
+ return ERR_PTR(-ESRCH);
+
+ *flags = f_flags;
+ return task;
+}
+
+/**
* pidfd_create() - Create a new pid file descriptor.
*
* @pid: struct pid that the pidfd will reference
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index a332ccd829e2..0153b0ca7b23 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -2,7 +2,7 @@
/*
* Energy Model of devices
*
- * Copyright (c) 2018-2020, Arm ltd.
+ * Copyright (c) 2018-2021, Arm ltd.
* Written by: Quentin Perret, Arm ltd.
* Improvements provided by: Lukasz Luba, Arm ltd.
*/
@@ -10,6 +10,7 @@
#define pr_fmt(fmt) "energy_model: " fmt
#include <linux/cpu.h>
+#include <linux/cpufreq.h>
#include <linux/cpumask.h>
#include <linux/debugfs.h>
#include <linux/energy_model.h>
@@ -42,6 +43,7 @@ static void em_debug_create_ps(struct em_perf_state *ps, struct dentry *pd)
debugfs_create_ulong("frequency", 0444, d, &ps->frequency);
debugfs_create_ulong("power", 0444, d, &ps->power);
debugfs_create_ulong("cost", 0444, d, &ps->cost);
+ debugfs_create_ulong("inefficient", 0444, d, &ps->flags);
}
static int em_debug_cpus_show(struct seq_file *s, void *unused)
@@ -55,7 +57,8 @@ DEFINE_SHOW_ATTRIBUTE(em_debug_cpus);
static int em_debug_units_show(struct seq_file *s, void *unused)
{
struct em_perf_domain *pd = s->private;
- char *units = pd->milliwatts ? "milliWatts" : "bogoWatts";
+ char *units = (pd->flags & EM_PERF_DOMAIN_MILLIWATTS) ?
+ "milliWatts" : "bogoWatts";
seq_printf(s, "%s\n", units);
@@ -63,6 +66,17 @@ static int em_debug_units_show(struct seq_file *s, void *unused)
}
DEFINE_SHOW_ATTRIBUTE(em_debug_units);
+static int em_debug_skip_inefficiencies_show(struct seq_file *s, void *unused)
+{
+ struct em_perf_domain *pd = s->private;
+ int enabled = (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES) ? 1 : 0;
+
+ seq_printf(s, "%d\n", enabled);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(em_debug_skip_inefficiencies);
+
static void em_debug_create_pd(struct device *dev)
{
struct dentry *d;
@@ -76,6 +90,8 @@ static void em_debug_create_pd(struct device *dev)
&em_debug_cpus_fops);
debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops);
+ debugfs_create_file("skip-inefficiencies", 0444, d, dev->em_pd,
+ &em_debug_skip_inefficiencies_fops);
/* Create a sub-directory for each performance state */
for (i = 0; i < dev->em_pd->nr_perf_states; i++)
@@ -107,8 +123,7 @@ static void em_debug_remove_pd(struct device *dev) {}
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
int nr_states, struct em_data_callback *cb)
{
- unsigned long opp_eff, prev_opp_eff = ULONG_MAX;
- unsigned long power, freq, prev_freq = 0;
+ unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
int i, ret;
u64 fmax;
@@ -153,27 +168,22 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
table[i].power = power;
table[i].frequency = prev_freq = freq;
-
- /*
- * The hertz/watts efficiency ratio should decrease as the
- * frequency grows on sane platforms. But this isn't always
- * true in practice so warn the user if a higher OPP is more
- * power efficient than a lower one.
- */
- opp_eff = freq / power;
- if (opp_eff >= prev_opp_eff)
- dev_dbg(dev, "EM: hertz/watts ratio non-monotonically decreasing: em_perf_state %d >= em_perf_state%d\n",
- i, i - 1);
- prev_opp_eff = opp_eff;
}
/* Compute the cost of each performance state. */
fmax = (u64) table[nr_states - 1].frequency;
- for (i = 0; i < nr_states; i++) {
+ for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res = em_scale_power(table[i].power);
table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
+ if (table[i].cost >= prev_cost) {
+ table[i].flags = EM_PERF_STATE_INEFFICIENT;
+ dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
+ table[i].frequency);
+ } else {
+ prev_cost = table[i].cost;
+ }
}
pd->table = table;
@@ -222,6 +232,43 @@ static int em_create_pd(struct device *dev, int nr_states,
return 0;
}
+static void em_cpufreq_update_efficiencies(struct device *dev)
+{
+ struct em_perf_domain *pd = dev->em_pd;
+ struct em_perf_state *table;
+ struct cpufreq_policy *policy;
+ int found = 0;
+ int i;
+
+ if (!_is_cpu_device(dev) || !pd)
+ return;
+
+ policy = cpufreq_cpu_get(cpumask_first(em_span_cpus(pd)));
+ if (!policy) {
+ dev_warn(dev, "EM: Access to CPUFreq policy failed");
+ return;
+ }
+
+ table = pd->table;
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (!(table[i].flags & EM_PERF_STATE_INEFFICIENT))
+ continue;
+
+ if (!cpufreq_table_set_inefficient(policy, table[i].frequency))
+ found++;
+ }
+
+ if (!found)
+ return;
+
+ /*
+ * Efficiencies have been installed in CPUFreq, inefficient frequencies
+ * will be skipped. The EM can do the same.
+ */
+ pd->flags |= EM_PERF_DOMAIN_SKIP_INEFFICIENCIES;
+}
+
/**
* em_pd_get() - Return the performance domain for a device
* @dev : Device to find the performance domain for
@@ -335,7 +382,10 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
if (ret)
goto unlock;
- dev->em_pd->milliwatts = milliwatts;
+ if (milliwatts)
+ dev->em_pd->flags |= EM_PERF_DOMAIN_MILLIWATTS;
+
+ em_cpufreq_update_efficiencies(dev);
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 559acef3fddb..9ed9b744876c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -300,7 +300,7 @@ static int create_image(int platform_mode)
if (error || hibernation_test(TEST_PLATFORM))
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || hibernation_test(TEST_CPUS))
goto Enable_cpus;
@@ -342,7 +342,7 @@ static int create_image(int platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
/* Allow architectures to do nosmt-specific post-resume dances */
if (!in_suspend)
@@ -466,6 +466,8 @@ static int resume_target_kernel(bool platform_mode)
if (error)
goto Cleanup;
+ cpuidle_pause();
+
error = hibernate_resume_nonboot_cpu_disable();
if (error)
goto Enable_cpus;
@@ -509,7 +511,7 @@ static int resume_target_kernel(bool platform_mode)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Cleanup:
platform_restore_cleanup(platform_mode);
@@ -587,7 +589,7 @@ int hibernation_platform_enter(void)
if (error)
goto Platform_finish;
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error)
goto Enable_cpus;
@@ -609,7 +611,7 @@ int hibernation_platform_enter(void)
local_irq_enable();
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_finish:
hibernation_ops->finish();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 778bf431ec02..326f8d032eb5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -4,6 +4,8 @@
#include <linux/utsname.h>
#include <linux/freezer.h>
#include <linux/compiler.h>
+#include <linux/cpu.h>
+#include <linux/cpuidle.h>
struct swsusp_info {
struct new_utsname uts;
@@ -310,3 +312,15 @@ extern int pm_wake_lock(const char *buf);
extern int pm_wake_unlock(const char *buf);
#endif /* !CONFIG_PM_WAKELOCKS */
+
+static inline int pm_sleep_disable_secondary_cpus(void)
+{
+ cpuidle_pause();
+ return suspend_disable_secondary_cpus();
+}
+
+static inline void pm_sleep_enable_secondary_cpus(void)
+{
+ suspend_enable_secondary_cpus();
+ cpuidle_resume();
+}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 37401c99b7d7..b7e7798637b8 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -94,7 +94,7 @@ static int try_to_freeze_tasks(bool user_only)
todo - wq_busy, wq_busy);
if (wq_busy)
- show_workqueue_state();
+ show_all_workqueues();
if (!wakeup || pm_debug_messages_on) {
read_lock(&tasklist_lock);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index eb75f394a059..80cc1f0f502b 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -97,7 +97,6 @@ static void s2idle_enter(void)
raw_spin_unlock_irq(&s2idle_lock);
cpus_read_lock();
- cpuidle_resume();
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
@@ -105,7 +104,6 @@ static void s2idle_enter(void)
swait_event_exclusive(s2idle_wait_head,
s2idle_state == S2IDLE_STATE_WAKE);
- cpuidle_pause();
cpus_read_unlock();
raw_spin_lock_irq(&s2idle_lock);
@@ -162,11 +160,13 @@ EXPORT_SYMBOL_GPL(s2idle_wake);
static bool valid_state(suspend_state_t state)
{
/*
- * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
- * support and need to be valid to the low level
- * implementation, no valid callback implies that none are valid.
+ * The PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states require low-level
+ * support and need to be valid to the low-level implementation.
+ *
+ * No ->valid() or ->enter() callback implies that none are valid.
*/
- return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+ return suspend_ops && suspend_ops->valid && suspend_ops->valid(state) &&
+ suspend_ops->enter;
}
void __init pm_states_init(void)
@@ -238,7 +238,7 @@ EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
static bool sleep_state_supported(suspend_state_t state)
{
- return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
+ return state == PM_SUSPEND_TO_IDLE || valid_state(state);
}
static int platform_suspend_prepare(suspend_state_t state)
@@ -422,7 +422,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- error = suspend_disable_secondary_cpus();
+ error = pm_sleep_disable_secondary_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -452,7 +452,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
BUG_ON(irqs_disabled());
Enable_cpus:
- suspend_enable_secondary_cpus();
+ pm_sleep_enable_secondary_cpus();
Platform_wake:
platform_resume_noirq(state);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 3cb89baebc79..ff326c2cb77b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -299,7 +299,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
return error;
}
-static blk_status_t hib_wait_io(struct hib_bio_batch *hb)
+static int hib_wait_io(struct hib_bio_batch *hb)
{
/*
* We are relying on the behavior of blk_plug that a thread with
@@ -705,22 +705,19 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct cmp_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
/*
* Start the compression threads.
@@ -1198,22 +1195,19 @@ static int load_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
- data = vmalloc(array_size(nr_threads, sizeof(*data)));
+ data = vzalloc(array_size(nr_threads, sizeof(*data)));
if (!data) {
pr_err("Failed to allocate LZO data\n");
ret = -ENOMEM;
goto out_clean;
}
- for (thr = 0; thr < nr_threads; thr++)
- memset(&data[thr], 0, offsetof(struct dec_data, go));
- crc = kmalloc(sizeof(*crc), GFP_KERNEL);
+ crc = kzalloc(sizeof(*crc), GFP_KERNEL);
if (!crc) {
pr_err("Failed to allocate crc\n");
ret = -ENOMEM;
goto out_clean;
}
- memset(crc, 0, offsetof(struct crc_data, go));
clean_pages_on_decompress = true;
@@ -1521,9 +1515,10 @@ end:
int swsusp_check(void)
{
int error;
+ void *holder;
hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device,
- FMODE_READ, NULL);
+ FMODE_READ | FMODE_EXCL, &holder);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
clear_page(swsusp_header);
@@ -1545,7 +1540,7 @@ int swsusp_check(void)
put:
if (error)
- blkdev_put(hib_resume_bdev, FMODE_READ);
+ blkdev_put(hib_resume_bdev, FMODE_READ | FMODE_EXCL);
else
pr_debug("Image signature found, resuming\n");
} else {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 49e3c53ec510..57b132b658e1 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early)
return;
err_free_descs:
- memblock_free_ptr(new_descs, new_descs_size);
+ memblock_free(new_descs, new_descs_size);
err_free_log_buf:
- memblock_free_ptr(new_log_buf, new_log_buf_len);
+ memblock_free(new_log_buf, new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
diff --git a/kernel/reboot.c b/kernel/reboot.c
index f7440c0c7e43..6bcc5d6a6572 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(cad_pid);
#define DEFAULT_REBOOT_MODE
#endif
enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE;
+EXPORT_SYMBOL_GPL(reboot_mode);
enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;
/*
@@ -359,7 +360,6 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
case LINUX_REBOOT_CMD_HALT:
kernel_halt();
do_exit(0);
- panic("cannot halt");
case LINUX_REBOOT_CMD_POWER_OFF:
kernel_power_off();
diff --git a/kernel/resource.c b/kernel/resource.c
index ca9f5198a01f..5ad3eba619ba 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -73,6 +73,18 @@ static struct resource *next_resource(struct resource *p)
return p->sibling;
}
+static struct resource *next_resource_skip_children(struct resource *p)
+{
+ while (!p->sibling && p->parent)
+ p = p->parent;
+ return p->sibling;
+}
+
+#define for_each_resource(_root, _p, _skip_children) \
+ for ((_p) = (_root)->child; (_p); \
+ (_p) = (_skip_children) ? next_resource_skip_children(_p) : \
+ next_resource(_p))
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -1707,37 +1719,49 @@ static int strict_iomem_checks;
#endif
/*
- * check if an address is reserved in the iomem resource tree
- * returns true if reserved, false if not reserved.
+ * Check if an address is exclusive to the kernel and must not be mapped to
+ * user space, for example, via /dev/mem.
+ *
+ * Returns true if exclusive to the kernel, otherwise returns false.
*/
bool iomem_is_exclusive(u64 addr)
{
- struct resource *p = &iomem_resource;
- bool err = false;
- loff_t l;
+ const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM |
+ IORESOURCE_EXCLUSIVE;
+ bool skip_children = false, err = false;
int size = PAGE_SIZE;
-
- if (!strict_iomem_checks)
- return false;
+ struct resource *p;
addr = addr & PAGE_MASK;
read_lock(&resource_lock);
- for (p = p->child; p ; p = r_next(NULL, p, &l)) {
- /*
- * We can probably skip the resources without
- * IORESOURCE_IO attribute?
- */
+ for_each_resource(&iomem_resource, p, skip_children) {
if (p->start >= addr + size)
break;
- if (p->end < addr)
+ if (p->end < addr) {
+ skip_children = true;
continue;
+ }
+ skip_children = false;
+
+ /*
+ * IORESOURCE_SYSTEM_RAM resources are exclusive if
+ * IORESOURCE_EXCLUSIVE is set, even if they
+ * are not busy and even if "iomem=relaxed" is set. The
+ * responsible driver dynamically adds/removes system RAM within
+ * such an area and uncontrolled access is dangerous.
+ */
+ if ((p->flags & exclusive_system_ram) == exclusive_system_ram) {
+ err = true;
+ break;
+ }
+
/*
* A resource is exclusive if IORESOURCE_EXCLUSIVE is set
* or CONFIG_IO_STRICT_DEVMEM is enabled and the
* resource is busy.
*/
- if ((p->flags & IORESOURCE_BUSY) == 0)
+ if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY))
continue;
if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM)
|| p->flags & IORESOURCE_EXCLUSIVE) {
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2067080bb235..8629b37d118e 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -31,7 +31,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
- sched_offline_group(ag->tg);
+ sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 523fd602ea90..3c9b0fda64ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3726,6 +3726,9 @@ out:
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
@@ -6625,13 +6628,13 @@ __setup("preempt=", setup_preempt_mode);
static void __init preempt_dynamic_init(void)
{
if (preempt_dynamic_mode == preempt_dynamic_undefined) {
- if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
sched_dynamic_update(preempt_dynamic_none);
- } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) {
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary);
} else {
/* Default static call setting, nothing to do */
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR));
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
preempt_dynamic_mode = preempt_dynamic_full;
pr_info("Dynamic Preempt: full\n");
}
@@ -9716,6 +9719,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg);
}
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+ unregister_fair_sched_group(tg);
+ unregister_rt_sched_group(tg);
+ /*
+ * We have to wait for yet another RCU grace period to expire, as
+ * print_cfs_stats() might run concurrently.
+ */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
@@ -9759,25 +9778,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
- sched_free_group(container_of(rhp, struct task_group, rcu));
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
/* Wait for possible concurrent references to cfs_rqs complete: */
- call_rcu(&tg->rcu, sched_free_group_rcu);
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
{
unsigned long flags;
- /* End participation in shares distribution: */
- unregister_fair_sched_group(tg);
-
+ /*
+ * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+ * sched_cfs_period_timer()).
+ *
+ * For this to be effective, we have to wait for all pending users of
+ * this task group to leave their RCU critical section to ensure no new
+ * user will see our dying task group any more. Specifically ensure
+ * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+ *
+ * We therefore defer calling unregister_fair_sched_group() to
+ * sched_unregister_group() which is guarantied to get called only after the
+ * current RCU grace period has expired.
+ */
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
@@ -9896,7 +9925,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_offline_group(tg);
+ sched_release_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -9906,7 +9935,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
/*
* Relies on the RCU grace period between css_released() and this.
*/
- sched_free_group(tg);
+ sched_unregister_group(tg);
}
/*
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c
index 48ac72696012..517f72b008f5 100644
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -135,6 +135,10 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
if (!static_branch_likely(&sched_smt_present))
return -ENODEV;
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD != PIDTYPE_PID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_THREAD_GROUP != PIDTYPE_TGID);
+ BUILD_BUG_ON(PR_SCHED_CORE_SCOPE_PROCESS_GROUP != PIDTYPE_PGID);
+
if (type > PIDTYPE_PGID || cmd >= PR_SCHED_CORE_MAX || pid < 0 ||
(cmd != PR_SCHED_CORE_GET && uaddr))
return -EINVAL;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 13950beb01a2..6e476f6d9435 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11456,8 +11456,6 @@ void free_fair_sched_group(struct task_group *tg)
{
int i;
- destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -11534,6 +11532,8 @@ void unregister_fair_sched_group(struct task_group *tg)
struct rq *rq;
int cpu;
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
for_each_possible_cpu(cpu) {
if (tg->se[cpu])
remove_entity_load_avg(tg->se[cpu]);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bb945f8faeca..b48baaba2fc2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -137,13 +137,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
return rt_rq->rq;
}
-void free_rt_sched_group(struct task_group *tg)
+void unregister_rt_sched_group(struct task_group *tg)
{
- int i;
-
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -250,6 +254,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt;
}
+void unregister_rt_sched_group(struct task_group *tg) { }
+
void free_rt_sched_group(struct task_group *tg) { }
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f1612d26c18..0e66749486e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -488,6 +488,7 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern void unregister_rt_sched_group(struct task_group *tg);
extern void free_rt_sched_group(struct task_group *tg);
extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
@@ -503,7 +504,7 @@ extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 30169c7685b6..d201a7052a29 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1492,7 +1492,6 @@ static int sched_domains_curr_level;
int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
-int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index e99aff33ff14..7c4b7ae714d4 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1323,6 +1323,7 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, bool
blocked = sigismember(&t->blocked, sig);
if (blocked || ignored || sigdfl) {
action->sa.sa_handler = SIG_DFL;
+ action->sa.sa_flags |= SA_IMMUTABLE;
if (blocked) {
sigdelset(&t->blocked, sig);
recalc_sigpending_and_wake(t);
@@ -1649,6 +1650,19 @@ void force_sig(int sig)
}
EXPORT_SYMBOL(force_sig);
+void force_fatal_sig(int sig)
+{
+ struct kernel_siginfo info;
+
+ clear_siginfo(&info);
+ info.si_signo = sig;
+ info.si_errno = 0;
+ info.si_code = SI_KERNEL;
+ info.si_pid = 0;
+ info.si_uid = 0;
+ force_sig_info_to_task(&info, current, true);
+}
+
/*
* When things go south during signal handling, we
* will force a SIGSEGV. And if the signal that caused
@@ -1657,15 +1671,10 @@ EXPORT_SYMBOL(force_sig);
*/
void force_sigsegv(int sig)
{
- struct task_struct *p = current;
-
- if (sig == SIGSEGV) {
- unsigned long flags;
- spin_lock_irqsave(&p->sighand->siglock, flags);
- p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
- spin_unlock_irqrestore(&p->sighand->siglock, flags);
- }
- force_sig(SIGSEGV);
+ if (sig == SIGSEGV)
+ force_fatal_sig(SIGSEGV);
+ else
+ force_sig(SIGSEGV);
}
int force_sig_fault_to_task(int sig, int code, void __user *addr
@@ -2145,40 +2154,6 @@ static void do_notify_parent_cldstop(struct task_struct *tsk,
spin_unlock_irqrestore(&sighand->siglock, flags);
}
-static inline bool may_ptrace_stop(void)
-{
- if (!likely(current->ptrace))
- return false;
- /*
- * Are we in the middle of do_coredump?
- * If so and our tracer is also part of the coredump stopping
- * is a deadlock situation, and pointless because our tracer
- * is dead so don't allow us to stop.
- * If SIGKILL was already sent before the caller unlocked
- * ->siglock we must see ->core_state != NULL. Otherwise it
- * is safe to enter schedule().
- *
- * This is almost outdated, a task with the pending SIGKILL can't
- * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
- * after SIGKILL was already dequeued.
- */
- if (unlikely(current->mm->core_state) &&
- unlikely(current->mm == current->parent->mm))
- return false;
-
- return true;
-}
-
-/*
- * Return non-zero if there is a SIGKILL that should be waking us up.
- * Called with the siglock held.
- */
-static bool sigkill_pending(struct task_struct *tsk)
-{
- return sigismember(&tsk->pending.signal, SIGKILL) ||
- sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
-}
-
/*
* This must be called with current->sighand->siglock held.
*
@@ -2196,7 +2171,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
{
bool gstop_done = false;
- if (arch_ptrace_stop_needed(exit_code, info)) {
+ if (arch_ptrace_stop_needed()) {
/*
* The arch code has something special to do before a
* ptrace stop. This is allowed to block, e.g. for faults
@@ -2204,17 +2179,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
* calling arch_ptrace_stop, so we must release it now.
* To preserve proper semantics, we must do this before
* any signal bookkeeping like checking group_stop_count.
- * Meanwhile, a SIGKILL could come in before we retake the
- * siglock. That must prevent us from sleeping in TASK_TRACED.
- * So after regaining the lock, we must check for SIGKILL.
*/
spin_unlock_irq(&current->sighand->siglock);
- arch_ptrace_stop(exit_code, info);
+ arch_ptrace_stop();
spin_lock_irq(&current->sighand->siglock);
- if (sigkill_pending(current))
- return;
}
+ /*
+ * schedule() will not sleep if there is a pending signal that
+ * can awaken the task.
+ */
set_special_state(TASK_TRACED);
/*
@@ -2260,7 +2234,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
- if (may_ptrace_stop()) {
+ if (likely(current->ptrace)) {
/*
* Notify parents of the stop.
*
@@ -2739,7 +2713,8 @@ relock:
if (!signr)
break; /* will return 0 */
- if (unlikely(current->ptrace) && signr != SIGKILL) {
+ if (unlikely(current->ptrace) && (signr != SIGKILL) &&
+ !(sighand->action[signr -1].sa.sa_flags & SA_IMMUTABLE)) {
signr = ptrace_signal(signr, &ksig->info);
if (!signr)
continue;
@@ -4089,6 +4064,10 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
k = &p->sighand->action[sig-1];
spin_lock_irq(&p->sighand->siglock);
+ if (k->sa.sa_flags & SA_IMMUTABLE) {
+ spin_unlock_irq(&p->sighand->siglock);
+ return -EINVAL;
+ }
if (oact)
*oact = *k;
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9f8117c7cfdd..9c625257023d 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -13,6 +13,7 @@
#include <linux/export.h>
#include <linux/kallsyms.h>
#include <linux/stacktrace.h>
+#include <linux/interrupt.h>
/**
* stack_trace_print - Print the entries in the stack trace
@@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
#endif /* CONFIG_USER_STACKTRACE_SUPPORT */
#endif /* !CONFIG_ARCH_STACKWALK */
+
+static inline bool in_irqentry_text(unsigned long ptr)
+{
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+/**
+ * filter_irq_stacks - Find first IRQ stack entry in trace
+ * @entries: Pointer to stack trace array
+ * @nr_entries: Number of entries in the storage array
+ *
+ * Return: Number of trace entries until IRQ stack starts.
+ */
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (in_irqentry_text(entries[i])) {
+ /* Include the irqentry function into the stack. */
+ return i + 1;
+ }
+ }
+ return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 643d412ac623..96b4e7810426 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1159,13 +1159,28 @@ static void posix_cpu_timers_work(struct callback_head *work)
}
/*
+ * Clear existing posix CPU timers task work.
+ */
+void clear_posix_cputimers_work(struct task_struct *p)
+{
+ /*
+ * A copied work entry from the old task is not meaningful, clear it.
+ * N.B. init_task_work will not do this.
+ */
+ memset(&p->posix_cputimers_work.work, 0,
+ sizeof(p->posix_cputimers_work.work));
+ init_task_work(&p->posix_cputimers_work.work,
+ posix_cpu_timers_work);
+ p->posix_cputimers_work.scheduled = false;
+}
+
+/*
* Initialize posix CPU timers task work in init task. Out of line to
* keep the callback static and to avoid header recursion hell.
*/
void __init posix_cputimers_init_work(void)
{
- init_task_work(&current->posix_cputimers_work.work,
- posix_cpu_timers_work);
+ clear_posix_cputimers_work(current);
}
/*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3ea4e20072f..30bc880c3849 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -318,7 +318,7 @@ int __register_ftrace_function(struct ftrace_ops *ops)
if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT))
return -EBUSY;
- if (!core_kernel_data((unsigned long)ops))
+ if (!is_kernel_core_data((unsigned long)ops))
ops->flags |= FTRACE_OPS_FL_DYNAMIC;
add_ftrace_ops(&ftrace_ops_list, ops);
@@ -5602,10 +5602,11 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
}
+ mutex_unlock(&ftrace_lock);
+
/* Removing the tmp_ops will add the updated direct callers to the functions */
unregister_ftrace_function(&tmp_ops);
- mutex_unlock(&ftrace_lock);
out_direct:
mutex_unlock(&direct_mutex);
return err;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f6520d0a4c8c..2699e9e562b1 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5228,6 +5228,9 @@ void ring_buffer_reset(struct trace_buffer *buffer)
struct ring_buffer_per_cpu *cpu_buffer;
int cpu;
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+
for_each_buffer_cpu(buffer, cpu) {
cpu_buffer = buffer->buffers[cpu];
@@ -5246,6 +5249,8 @@ void ring_buffer_reset(struct trace_buffer *buffer)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&cpu_buffer->resize_disabled);
}
+
+ mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c88bbfe75d1d..f9139dc1262c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5605,6 +5605,7 @@ static const char readme_msg[] =
#ifdef CONFIG_HIST_TRIGGERS
" hist trigger\t- If set, event hits are aggregated into a hash table\n"
"\t Format: hist:keys=<field1[,field2,...]>\n"
+ "\t [:<var1>=<field|var_ref|numeric_literal>[,<var2>=...]]\n"
"\t [:values=<field1[,field2,...]>]\n"
"\t [:sort=<field1[,field2,...]>]\n"
"\t [:size=#entries]\n"
@@ -5616,6 +5617,16 @@ static const char readme_msg[] =
"\t common_timestamp - to record current timestamp\n"
"\t common_cpu - to record the CPU the event happened on\n"
"\n"
+ "\t A hist trigger variable can be:\n"
+ "\t - a reference to a field e.g. x=current_timestamp,\n"
+ "\t - a reference to another variable e.g. y=$x,\n"
+ "\t - a numeric literal: e.g. ms_per_sec=1000,\n"
+ "\t - an arithmetic expression: e.g. time_secs=current_timestamp/1000\n"
+ "\n"
+ "\t hist trigger aritmethic expressions support addition(+), subtraction(-),\n"
+ "\t multiplication(*) and division(/) operators. An operand can be either a\n"
+ "\t variable reference, field or numeric literal.\n"
+ "\n"
"\t When a matching event is hit, an entry is added to a hash\n"
"\t table using the key(s) and value(s) named, and the value of a\n"
"\t sum called 'hitcount' is incremented. Keys and values\n"
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 61586f16a853..5ea2c9ec54a6 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -68,7 +68,8 @@
C(INVALID_SORT_FIELD, "Sort field must be a key or a val"), \
C(INVALID_STR_OPERAND, "String type can not be an operand in expression"), \
C(EXPECT_NUMBER, "Expecting numeric literal"), \
- C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"),
+ C(UNARY_MINUS_SUBEXPR, "Unary minus not supported in sub-expressions"), \
+ C(DIVISION_BY_ZERO, "Division by zero"),
#undef C
#define C(a, b) HIST_ERR_##a
@@ -92,6 +93,7 @@ typedef u64 (*hist_field_fn_t) (struct hist_field *field,
#define HIST_FIELDS_MAX (TRACING_MAP_FIELDS_MAX + TRACING_MAP_VARS_MAX)
#define HIST_ACTIONS_MAX 8
#define HIST_CONST_DIGITS_MAX 21
+#define HIST_DIV_SHIFT 20 /* For optimizing division by constants */
enum field_op_id {
FIELD_OP_NONE,
@@ -160,6 +162,8 @@ struct hist_field {
/* Numeric literals are represented as u64 */
u64 constant;
+ /* Used to optimize division by constants */
+ u64 div_multiplier;
};
static u64 hist_field_none(struct hist_field *field,
@@ -311,6 +315,68 @@ static u64 hist_field_div(struct hist_field *hist_field,
return div64_u64(val1, val2);
}
+static u64 div_by_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ return val1 >> __ffs64(operand2->constant);
+}
+
+static u64 div_by_not_power_of_two(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ return div64_u64(val1, operand2->constant);
+}
+
+static u64 div_by_mult_and_shift(struct hist_field *hist_field,
+ struct tracing_map_elt *elt,
+ struct trace_buffer *buffer,
+ struct ring_buffer_event *rbe,
+ void *event)
+{
+ struct hist_field *operand1 = hist_field->operands[0];
+ struct hist_field *operand2 = hist_field->operands[1];
+
+ u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event);
+
+ /*
+ * If the divisor is a constant, do a multiplication and shift instead.
+ *
+ * Choose Z = some power of 2. If Y <= Z, then:
+ * X / Y = (X * (Z / Y)) / Z
+ *
+ * (Z / Y) is a constant (mult) which is calculated at parse time, so:
+ * X / Y = (X * mult) / Z
+ *
+ * The division by Z can be replaced by a shift since Z is a power of 2:
+ * X / Y = (X * mult) >> HIST_DIV_SHIFT
+ *
+ * As long, as X < Z the results will not be off by more than 1.
+ */
+ if (val1 < (1 << HIST_DIV_SHIFT)) {
+ u64 mult = operand2->div_multiplier;
+
+ return (val1 * mult + ((1 << HIST_DIV_SHIFT) - 1)) >> HIST_DIV_SHIFT;
+ }
+
+ return div64_u64(val1, operand2->constant);
+}
+
static u64 hist_field_mult(struct hist_field *hist_field,
struct tracing_map_elt *elt,
struct trace_buffer *buffer,
@@ -573,6 +639,25 @@ struct snapshot_context {
void *key;
};
+/*
+ * Returns the specific division function to use if the divisor
+ * is constant. This avoids extra branches when the trigger is hit.
+ */
+static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor)
+{
+ u64 div = divisor->constant;
+
+ if (!(div & (div - 1)))
+ return div_by_power_of_two;
+
+ /* If the divisor is too large, do a regular division */
+ if (div > (1 << HIST_DIV_SHIFT))
+ return div_by_not_power_of_two;
+
+ divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div);
+ return div_by_mult_and_shift;
+}
+
static void track_data_free(struct track_data *track_data)
{
struct hist_elt_data *elt_data;
@@ -1868,9 +1953,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field->type)
goto free;
- if (field->filter_type == FILTER_STATIC_STRING)
+ if (field->filter_type == FILTER_STATIC_STRING) {
hist_field->fn = hist_field_string;
- else if (field->filter_type == FILTER_DYN_STRING)
+ hist_field->size = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING)
hist_field->fn = hist_field_dynstring;
else
hist_field->fn = hist_field_pstring;
@@ -2495,7 +2581,8 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
operand1_str = str;
str = sep+1;
- if (!operand1_str || !str)
+ /* Binary operator requires both operands */
+ if (*operand1_str == '\0' || *str == '\0')
goto free;
operand_flags = 0;
@@ -2575,6 +2662,24 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1;
expr->operands[1] = operand2;
+ if (field_op == FIELD_OP_DIV &&
+ operand2_flags & HIST_FIELD_FL_CONST) {
+ u64 divisor = var2 ? var2->constant : operand2->constant;
+
+ if (!divisor) {
+ hist_err(file->tr, HIST_ERR_DIVISION_BY_ZERO, errpos(str));
+ ret = -EDOM;
+ goto free;
+ }
+
+ /*
+ * Copy the divisor here so we don't have to look it up
+ * later if this is a var ref
+ */
+ operand2->constant = divisor;
+ op_fn = hist_field_get_div_fn(operand2);
+ }
+
if (combine_consts) {
if (var1)
expr->operands[0] = var1;
@@ -2921,8 +3026,10 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
if (val->flags & HIST_FIELD_FL_STRING) {
char *str = elt_data->field_var_str[j++];
char *val_str = (char *)(uintptr_t)var_val;
+ unsigned int size;
- strscpy(str, val_str, STR_VAR_LEN_MAX);
+ size = min(val->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
var_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, var_val);
@@ -4809,6 +4916,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
if (hist_field->flags & HIST_FIELD_FL_STRING) {
unsigned int str_start, var_str_idx, idx;
char *str, *val_str;
+ unsigned int size;
str_start = hist_data->n_field_var_str +
hist_data->n_save_var_str;
@@ -4817,7 +4925,9 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
str = elt_data->field_var_str[idx];
val_str = (char *)(uintptr_t)hist_val;
- strscpy(str, val_str, STR_VAR_LEN_MAX);
+
+ size = min(hist_field->size, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, size);
hist_val = (u64)(uintptr_t)str;
}
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index d11b41784fac..7520d43aed55 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -38,8 +38,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/osnoise.h>
-static struct trace_array *osnoise_trace;
-
/*
* Default values.
*/
@@ -51,6 +49,100 @@ static struct trace_array *osnoise_trace;
#define DEFAULT_TIMERLAT_PRIO 95 /* FIFO 95 */
/*
+ * trace_array of the enabled osnoise/timerlat instances.
+ */
+struct osnoise_instance {
+ struct list_head list;
+ struct trace_array *tr;
+};
+
+static struct list_head osnoise_instances;
+
+static bool osnoise_has_registered_instances(void)
+{
+ return !!list_first_or_null_rcu(&osnoise_instances,
+ struct osnoise_instance,
+ list);
+}
+
+/*
+ * osnoise_instance_registered - check if a tr is already registered
+ */
+static int osnoise_instance_registered(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+ int found = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ if (inst->tr == tr)
+ found = 1;
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+/*
+ * osnoise_register_instance - register a new trace instance
+ *
+ * Register a trace_array *tr in the list of instances running
+ * osnoise/timerlat tracers.
+ */
+static int osnoise_register_instance(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+
+ /*
+ * register/unregister serialization is provided by trace's
+ * trace_types_lock.
+ */
+ lockdep_assert_held(&trace_types_lock);
+
+ inst = kmalloc(sizeof(*inst), GFP_KERNEL);
+ if (!inst)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD_RCU(&inst->list);
+ inst->tr = tr;
+ list_add_tail_rcu(&inst->list, &osnoise_instances);
+
+ return 0;
+}
+
+/*
+ * osnoise_unregister_instance - unregister a registered trace instance
+ *
+ * Remove the trace_array *tr from the list of instances running
+ * osnoise/timerlat tracers.
+ */
+static void osnoise_unregister_instance(struct trace_array *tr)
+{
+ struct osnoise_instance *inst;
+ int found = 0;
+
+ /*
+ * register/unregister serialization is provided by trace's
+ * trace_types_lock.
+ */
+ lockdep_assert_held(&trace_types_lock);
+
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ if (inst->tr == tr) {
+ list_del_rcu(&inst->list);
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found)
+ return;
+
+ synchronize_rcu();
+ kfree(inst);
+}
+
+/*
* NMI runtime info.
*/
struct osn_nmi {
@@ -248,10 +340,56 @@ static struct osnoise_data {
#endif
};
-/*
- * Boolean variable used to inform that the tracer is currently sampling.
- */
-static bool osnoise_busy;
+#ifdef CONFIG_TIMERLAT_TRACER
+static inline bool timerlat_enabled(void)
+{
+ return osnoise_data.timerlat_tracer;
+}
+
+static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
+{
+ struct timerlat_variables *tlat_var = this_cpu_tmr_var();
+ /*
+ * If the timerlat is enabled, but the irq handler did
+ * not run yet enabling timerlat_tracer, do not trace.
+ */
+ if (!tlat_var->tracing_thread) {
+ osn_var->softirq.arrival_time = 0;
+ osn_var->softirq.delta_start = 0;
+ return 0;
+ }
+ return 1;
+}
+
+static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
+{
+ struct timerlat_variables *tlat_var = this_cpu_tmr_var();
+ /*
+ * If the timerlat is enabled, but the irq handler did
+ * not run yet enabling timerlat_tracer, do not trace.
+ */
+ if (!tlat_var->tracing_thread) {
+ osn_var->thread.delta_start = 0;
+ osn_var->thread.arrival_time = 0;
+ return 0;
+ }
+ return 1;
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+static inline bool timerlat_enabled(void)
+{
+ return false;
+}
+
+static inline int timerlat_softirq_exit(struct osnoise_variables *osn_var)
+{
+ return 1;
+}
+static inline int timerlat_thread_exit(struct osnoise_variables *osn_var)
+{
+ return 1;
+}
+#endif
#ifdef CONFIG_PREEMPT_RT
/*
@@ -315,19 +453,24 @@ static void print_osnoise_headers(struct seq_file *s)
* osnoise_taint - report an osnoise error.
*/
#define osnoise_taint(msg) ({ \
- struct trace_array *tr = osnoise_trace; \
+ struct osnoise_instance *inst; \
+ struct trace_buffer *buffer; \
\
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_, msg); \
+ rcu_read_lock(); \
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) { \
+ buffer = inst->tr->array_buffer.buffer; \
+ trace_array_printk_buf(buffer, _THIS_IP_, msg); \
+ } \
+ rcu_read_unlock(); \
osnoise_data.tainted = true; \
})
/*
* Record an osnoise_sample into the tracer buffer.
*/
-static void trace_osnoise_sample(struct osnoise_sample *sample)
+static void
+__trace_osnoise_sample(struct osnoise_sample *sample, struct trace_buffer *buffer)
{
- struct trace_array *tr = osnoise_trace;
- struct trace_buffer *buffer = tr->array_buffer.buffer;
struct trace_event_call *call = &event_osnoise;
struct ring_buffer_event *event;
struct osnoise_entry *entry;
@@ -350,6 +493,22 @@ static void trace_osnoise_sample(struct osnoise_sample *sample)
trace_buffer_unlock_commit_nostack(buffer, event);
}
+/*
+ * Record an osnoise_sample on all osnoise instances.
+ */
+static void trace_osnoise_sample(struct osnoise_sample *sample)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __trace_osnoise_sample(sample, buffer);
+ }
+ rcu_read_unlock();
+}
+
#ifdef CONFIG_TIMERLAT_TRACER
/*
* Print the timerlat header info.
@@ -388,14 +547,10 @@ static void print_timerlat_headers(struct seq_file *s)
}
#endif /* CONFIG_PREEMPT_RT */
-/*
- * Record an timerlat_sample into the tracer buffer.
- */
-static void trace_timerlat_sample(struct timerlat_sample *sample)
+static void
+__trace_timerlat_sample(struct timerlat_sample *sample, struct trace_buffer *buffer)
{
- struct trace_array *tr = osnoise_trace;
struct trace_event_call *call = &event_osnoise;
- struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
struct timerlat_entry *entry;
@@ -412,6 +567,22 @@ static void trace_timerlat_sample(struct timerlat_sample *sample)
trace_buffer_unlock_commit_nostack(buffer, event);
}
+/*
+ * Record an timerlat_sample into the tracer buffer.
+ */
+static void trace_timerlat_sample(struct timerlat_sample *sample)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __trace_timerlat_sample(sample, buffer);
+ }
+ rcu_read_unlock();
+}
+
#ifdef CONFIG_STACKTRACE
#define MAX_CALLS 256
@@ -451,29 +622,18 @@ static void timerlat_save_stack(int skip)
return;
}
-/*
- * timerlat_dump_stack - dump a stack trace previously saved
- *
- * Dump a saved stack trace into the trace buffer.
- */
-static void timerlat_dump_stack(void)
+
+static void
+__timerlat_dump_stack(struct trace_buffer *buffer, struct trace_stack *fstack, unsigned int size)
{
struct trace_event_call *call = &event_osnoise;
- struct trace_array *tr = osnoise_trace;
- struct trace_buffer *buffer = tr->array_buffer.buffer;
struct ring_buffer_event *event;
- struct trace_stack *fstack;
struct stack_entry *entry;
- unsigned int size;
-
- preempt_disable_notrace();
- fstack = this_cpu_ptr(&trace_stack);
- size = fstack->stack_size;
event = trace_buffer_lock_reserve(buffer, TRACE_STACK, sizeof(*entry) + size,
tracing_gen_ctx());
if (!event)
- goto out;
+ return;
entry = ring_buffer_event_data(event);
@@ -482,12 +642,39 @@ static void timerlat_dump_stack(void)
if (!call_filter_check_discard(call, entry, buffer, event))
trace_buffer_unlock_commit_nostack(buffer, event);
+}
-out:
+/*
+ * timerlat_dump_stack - dump a stack trace previously saved
+ */
+static void timerlat_dump_stack(u64 latency)
+{
+ struct osnoise_instance *inst;
+ struct trace_buffer *buffer;
+ struct trace_stack *fstack;
+ unsigned int size;
+
+ /*
+ * trace only if latency > print_stack config, if enabled.
+ */
+ if (!osnoise_data.print_stack || osnoise_data.print_stack > latency)
+ return;
+
+ preempt_disable_notrace();
+ fstack = this_cpu_ptr(&trace_stack);
+ size = fstack->stack_size;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ buffer = inst->tr->array_buffer.buffer;
+ __timerlat_dump_stack(buffer, fstack, size);
+
+ }
+ rcu_read_unlock();
preempt_enable_notrace();
}
-#else
-#define timerlat_dump_stack() do {} while (0)
+#else /* CONFIG_STACKTRACE */
+#define timerlat_dump_stack(u64 latency) do {} while (0)
#define timerlat_save_stack(a) do {} while (0)
#endif /* CONFIG_STACKTRACE */
#endif /* CONFIG_TIMERLAT_TRACER */
@@ -867,21 +1054,9 @@ static void trace_softirq_exit_callback(void *data, unsigned int vec_nr)
if (!osn_var->sampling)
return;
-#ifdef CONFIG_TIMERLAT_TRACER
- /*
- * If the timerlat is enabled, but the irq handler did
- * not run yet enabling timerlat_tracer, do not trace.
- */
- if (unlikely(osnoise_data.timerlat_tracer)) {
- struct timerlat_variables *tlat_var;
- tlat_var = this_cpu_tmr_var();
- if (!tlat_var->tracing_thread) {
- osn_var->softirq.arrival_time = 0;
- osn_var->softirq.delta_start = 0;
+ if (unlikely(timerlat_enabled()))
+ if (!timerlat_softirq_exit(osn_var))
return;
- }
- }
-#endif
duration = get_int_safe_duration(osn_var, &osn_var->softirq.delta_start);
trace_softirq_noise(vec_nr, osn_var->softirq.arrival_time, duration);
@@ -975,17 +1150,9 @@ thread_exit(struct osnoise_variables *osn_var, struct task_struct *t)
if (!osn_var->sampling)
return;
-#ifdef CONFIG_TIMERLAT_TRACER
- if (osnoise_data.timerlat_tracer) {
- struct timerlat_variables *tlat_var;
- tlat_var = this_cpu_tmr_var();
- if (!tlat_var->tracing_thread) {
- osn_var->thread.delta_start = 0;
- osn_var->thread.arrival_time = 0;
+ if (unlikely(timerlat_enabled()))
+ if (!timerlat_thread_exit(osn_var))
return;
- }
- }
-#endif
duration = get_int_safe_duration(osn_var, &osn_var->thread.delta_start);
@@ -1078,12 +1245,37 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *
*/
static __always_inline void osnoise_stop_tracing(void)
{
- struct trace_array *tr = osnoise_trace;
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
- trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
- "stop tracing hit on cpu %d\n", smp_processor_id());
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
- tracer_tracing_off(tr);
+ tracer_tracing_off(tr);
+ }
+ rcu_read_unlock();
+}
+
+/*
+ * notify_new_max_latency - Notify a new max latency via fsnotify interface.
+ */
+static void notify_new_max_latency(u64 latency)
+{
+ struct osnoise_instance *inst;
+ struct trace_array *tr;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(inst, &osnoise_instances, list) {
+ tr = inst->tr;
+ if (tr->max_latency < latency) {
+ tr->max_latency = latency;
+ latency_fsnotify(tr);
+ }
+ }
+ rcu_read_unlock();
}
/*
@@ -1097,7 +1289,6 @@ static __always_inline void osnoise_stop_tracing(void)
static int run_osnoise(void)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- struct trace_array *tr = osnoise_trace;
u64 start, sample, last_sample;
u64 last_int_count, int_count;
s64 noise = 0, max_noise = 0;
@@ -1232,11 +1423,7 @@ static int run_osnoise(void)
trace_osnoise_sample(&s);
- /* Keep a running maximum ever recorded osnoise "latency" */
- if (max_noise > tr->max_latency) {
- tr->max_latency = max_noise;
- latency_fsnotify(tr);
- }
+ notify_new_max_latency(max_noise);
if (osnoise_data.stop_tracing_total)
if (s.noise > osnoise_data.stop_tracing_total)
@@ -1294,7 +1481,6 @@ static int osnoise_main(void *data)
static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
{
struct osnoise_variables *osn_var = this_cpu_osn_var();
- struct trace_array *tr = osnoise_trace;
struct timerlat_variables *tlat;
struct timerlat_sample s;
u64 now;
@@ -1333,9 +1519,11 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
* running, the thread needs to receive the softirq delta_start. The
* reason being is that the softirq will be the last to be unfolded,
* resseting the thread delay to zero.
+ *
+ * The PREEMPT_RT is a special case, though. As softirqs run as threads
+ * on RT, moving the thread is enough.
*/
-#ifndef CONFIG_PREEMPT_RT
- if (osn_var->softirq.delta_start) {
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && osn_var->softirq.delta_start) {
copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
&osn_var->softirq.delta_start);
@@ -1345,13 +1533,6 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
copy_int_safe_time(osn_var, &osn_var->thread.delta_start,
&osn_var->irq.delta_start);
}
-#else /* CONFIG_PREEMPT_RT */
- /*
- * The sofirqs run as threads on RT, so there is not need
- * to keep track of it.
- */
- copy_int_safe_time(osn_var, &osn_var->thread.delta_start, &osn_var->irq.delta_start);
-#endif /* CONFIG_PREEMPT_RT */
/*
* Compute the current time with the expected time.
@@ -1365,11 +1546,7 @@ static enum hrtimer_restart timerlat_irq(struct hrtimer *timer)
trace_timerlat_sample(&s);
- /* Keep a running maximum ever recorded os noise "latency" */
- if (diff > tr->max_latency) {
- tr->max_latency = diff;
- latency_fsnotify(tr);
- }
+ notify_new_max_latency(diff);
if (osnoise_data.stop_tracing)
if (time_to_us(diff) >= osnoise_data.stop_tracing)
@@ -1457,11 +1634,7 @@ static int timerlat_main(void *data)
trace_timerlat_sample(&s);
-#ifdef CONFIG_STACKTRACE
- if (osnoise_data.print_stack)
- if (osnoise_data.print_stack <= time_to_us(diff))
- timerlat_dump_stack();
-#endif /* CONFIG_STACKTRACE */
+ timerlat_dump_stack(time_to_us(diff));
tlat->tracing_thread = false;
if (osnoise_data.stop_tracing_total)
@@ -1474,6 +1647,11 @@ static int timerlat_main(void *data)
hrtimer_cancel(&tlat->timer);
return 0;
}
+#else /* CONFIG_TIMERLAT_TRACER */
+static int timerlat_main(void *data)
+{
+ return 0;
+}
#endif /* CONFIG_TIMERLAT_TRACER */
/*
@@ -1516,16 +1694,13 @@ static int start_kthread(unsigned int cpu)
void *main = osnoise_main;
char comm[24];
-#ifdef CONFIG_TIMERLAT_TRACER
- if (osnoise_data.timerlat_tracer) {
+ if (timerlat_enabled()) {
snprintf(comm, 24, "timerlat/%d", cpu);
main = timerlat_main;
} else {
snprintf(comm, 24, "osnoise/%d", cpu);
}
-#else
- snprintf(comm, 24, "osnoise/%d", cpu);
-#endif
+
kthread = kthread_create_on_cpu(main, NULL, cpu, comm);
if (IS_ERR(kthread)) {
@@ -1546,7 +1721,7 @@ static int start_kthread(unsigned int cpu)
* This starts the kernel thread that will look for osnoise on many
* cpus.
*/
-static int start_per_cpu_kthreads(struct trace_array *tr)
+static int start_per_cpu_kthreads(void)
{
struct cpumask *current_mask = &save_cpumask;
int retval = 0;
@@ -1554,13 +1729,9 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
cpus_read_lock();
/*
- * Run only on CPUs in which trace and osnoise are allowed to run.
- */
- cpumask_and(current_mask, tr->tracing_cpumask, &osnoise_cpumask);
- /*
- * And the CPU is online.
+ * Run only on online CPUs in which osnoise is allowed to run.
*/
- cpumask_and(current_mask, cpu_online_mask, current_mask);
+ cpumask_and(current_mask, cpu_online_mask, &osnoise_cpumask);
for_each_possible_cpu(cpu)
per_cpu(per_cpu_osnoise_var, cpu).kthread = NULL;
@@ -1581,13 +1752,11 @@ static int start_per_cpu_kthreads(struct trace_array *tr)
#ifdef CONFIG_HOTPLUG_CPU
static void osnoise_hotplug_workfn(struct work_struct *dummy)
{
- struct trace_array *tr = osnoise_trace;
unsigned int cpu = smp_processor_id();
-
mutex_lock(&trace_types_lock);
- if (!osnoise_busy)
+ if (!osnoise_has_registered_instances())
goto out_unlock_trace;
mutex_lock(&interface_lock);
@@ -1596,9 +1765,6 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
goto out_unlock;
- if (!cpumask_test_cpu(cpu, tr->tracing_cpumask))
- goto out_unlock;
-
start_kthread(cpu);
out_unlock:
@@ -1687,9 +1853,6 @@ out_unlock:
return count;
}
-static void osnoise_tracer_start(struct trace_array *tr);
-static void osnoise_tracer_stop(struct trace_array *tr);
-
/*
* osnoise_cpus_write - Write function for "cpus" entry
* @filp: The active open file structure
@@ -1701,19 +1864,15 @@ static void osnoise_tracer_stop(struct trace_array *tr);
* interface to the osnoise trace. By default, it lists all CPUs,
* in this way, allowing osnoise threads to run on any online CPU
* of the system. It serves to restrict the execution of osnoise to the
- * set of CPUs writing via this interface. Note that osnoise also
- * respects the "tracing_cpumask." Hence, osnoise threads will run only
- * on the set of CPUs allowed here AND on "tracing_cpumask." Why not
- * have just "tracing_cpumask?" Because the user might be interested
- * in tracing what is running on other CPUs. For instance, one might
- * run osnoise in one HT CPU while observing what is running on the
- * sibling HT CPU.
+ * set of CPUs writing via this interface. Why not use "tracing_cpumask"?
+ * Because the user might be interested in tracing what is running on
+ * other CPUs. For instance, one might run osnoise in one HT CPU
+ * while observing what is running on the sibling HT CPU.
*/
static ssize_t
osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
loff_t *ppos)
{
- struct trace_array *tr = osnoise_trace;
cpumask_var_t osnoise_cpumask_new;
int running, err;
char buf[256];
@@ -1732,13 +1891,12 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
goto err_free;
/*
- * trace_types_lock is taken to avoid concurrency on start/stop
- * and osnoise_busy.
+ * trace_types_lock is taken to avoid concurrency on start/stop.
*/
mutex_lock(&trace_types_lock);
- running = osnoise_busy;
+ running = osnoise_has_registered_instances();
if (running)
- osnoise_tracer_stop(tr);
+ stop_per_cpu_kthreads();
mutex_lock(&interface_lock);
/*
@@ -1752,7 +1910,7 @@ osnoise_cpus_write(struct file *filp, const char __user *ubuf, size_t count,
mutex_unlock(&interface_lock);
if (running)
- osnoise_tracer_start(tr);
+ start_per_cpu_kthreads();
mutex_unlock(&trace_types_lock);
free_cpumask_var(osnoise_cpumask_new);
@@ -1836,6 +1994,47 @@ static const struct file_operations cpus_fops = {
.llseek = generic_file_llseek,
};
+#ifdef CONFIG_TIMERLAT_TRACER
+#ifdef CONFIG_STACKTRACE
+static int init_timerlat_stack_tracefs(struct dentry *top_dir)
+{
+ struct dentry *tmp;
+
+ tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir,
+ &osnoise_print_stack, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
+ return 0;
+}
+#else /* CONFIG_STACKTRACE */
+static int init_timerlat_stack_tracefs(struct dentry *top_dir)
+{
+ return 0;
+}
+#endif /* CONFIG_STACKTRACE */
+
+/*
+ * init_timerlat_tracefs - A function to initialize the timerlat interface files
+ */
+static int init_timerlat_tracefs(struct dentry *top_dir)
+{
+ struct dentry *tmp;
+
+ tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir,
+ &timerlat_period, &trace_min_max_fops);
+ if (!tmp)
+ return -ENOMEM;
+
+ return init_timerlat_stack_tracefs(top_dir);
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+static int init_timerlat_tracefs(struct dentry *top_dir)
+{
+ return 0;
+}
+#endif /* CONFIG_TIMERLAT_TRACER */
+
/*
* init_tracefs - A function to initialize the tracefs interface files
*
@@ -1880,19 +2079,10 @@ static int init_tracefs(void)
tmp = trace_create_file("cpus", TRACE_MODE_WRITE, top_dir, NULL, &cpus_fops);
if (!tmp)
goto err;
-#ifdef CONFIG_TIMERLAT_TRACER
-#ifdef CONFIG_STACKTRACE
- tmp = tracefs_create_file("print_stack", TRACE_MODE_WRITE, top_dir,
- &osnoise_print_stack, &trace_min_max_fops);
- if (!tmp)
- goto err;
-#endif
- tmp = tracefs_create_file("timerlat_period_us", TRACE_MODE_WRITE, top_dir,
- &timerlat_period, &trace_min_max_fops);
- if (!tmp)
+ ret = init_timerlat_tracefs(top_dir);
+ if (ret)
goto err;
-#endif
return 0;
@@ -1933,74 +2123,110 @@ out_unhook_irq:
return -EINVAL;
}
-static int __osnoise_tracer_start(struct trace_array *tr)
+/*
+ * osnoise_workload_start - start the workload and hook to events
+ */
+static int osnoise_workload_start(void)
{
int retval;
+ /*
+ * Instances need to be registered after calling workload
+ * start. Hence, if there is already an instance, the
+ * workload was already registered. Otherwise, this
+ * code is on the way to register the first instance,
+ * and the workload will start.
+ */
+ if (osnoise_has_registered_instances())
+ return 0;
+
osn_var_reset_all();
retval = osnoise_hook_events();
if (retval)
return retval;
+
/*
- * Make sure NMIs see reseted values.
+ * Make sure that ftrace_nmi_enter/exit() see reset values
+ * before enabling trace_osnoise_callback_enabled.
*/
barrier();
trace_osnoise_callback_enabled = true;
- retval = start_per_cpu_kthreads(tr);
+ retval = start_per_cpu_kthreads();
if (retval) {
unhook_irq_events();
return retval;
}
- osnoise_busy = true;
-
return 0;
}
+/*
+ * osnoise_workload_stop - stop the workload and unhook the events
+ */
+static void osnoise_workload_stop(void)
+{
+ /*
+ * Instances need to be unregistered before calling
+ * stop. Hence, if there is a registered instance, more
+ * than one instance is running, and the workload will not
+ * yet stop. Otherwise, this code is on the way to disable
+ * the last instance, and the workload can stop.
+ */
+ if (osnoise_has_registered_instances())
+ return;
+
+ trace_osnoise_callback_enabled = false;
+ /*
+ * Make sure that ftrace_nmi_enter/exit() see
+ * trace_osnoise_callback_enabled as false before continuing.
+ */
+ barrier();
+
+ stop_per_cpu_kthreads();
+
+ unhook_irq_events();
+ unhook_softirq_events();
+ unhook_thread_events();
+}
+
static void osnoise_tracer_start(struct trace_array *tr)
{
int retval;
- if (osnoise_busy)
+ /*
+ * If the instance is already registered, there is no need to
+ * register it again.
+ */
+ if (osnoise_instance_registered(tr))
return;
- retval = __osnoise_tracer_start(tr);
+ retval = osnoise_workload_start();
if (retval)
pr_err(BANNER "Error starting osnoise tracer\n");
+ osnoise_register_instance(tr);
}
static void osnoise_tracer_stop(struct trace_array *tr)
{
- if (!osnoise_busy)
- return;
-
- trace_osnoise_callback_enabled = false;
- barrier();
-
- stop_per_cpu_kthreads();
-
- unhook_irq_events();
- unhook_softirq_events();
- unhook_thread_events();
-
- osnoise_busy = false;
+ osnoise_unregister_instance(tr);
+ osnoise_workload_stop();
}
static int osnoise_tracer_init(struct trace_array *tr)
{
-
- /* Only allow one instance to enable this */
- if (osnoise_busy)
+ /*
+ * Only allow osnoise tracer if timerlat tracer is not running
+ * already.
+ */
+ if (timerlat_enabled())
return -EBUSY;
- osnoise_trace = tr;
tr->max_latency = 0;
osnoise_tracer_start(tr);
-
return 0;
}
@@ -2024,45 +2250,55 @@ static void timerlat_tracer_start(struct trace_array *tr)
{
int retval;
- if (osnoise_busy)
+ /*
+ * If the instance is already registered, there is no need to
+ * register it again.
+ */
+ if (osnoise_instance_registered(tr))
return;
- osnoise_data.timerlat_tracer = 1;
-
- retval = __osnoise_tracer_start(tr);
+ retval = osnoise_workload_start();
if (retval)
- goto out_err;
+ pr_err(BANNER "Error starting timerlat tracer\n");
+
+ osnoise_register_instance(tr);
return;
-out_err:
- pr_err(BANNER "Error starting timerlat tracer\n");
}
static void timerlat_tracer_stop(struct trace_array *tr)
{
int cpu;
- if (!osnoise_busy)
- return;
-
- for_each_online_cpu(cpu)
- per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
+ osnoise_unregister_instance(tr);
- osnoise_tracer_stop(tr);
+ /*
+ * Instruct the threads to stop only if this is the last instance.
+ */
+ if (!osnoise_has_registered_instances()) {
+ for_each_online_cpu(cpu)
+ per_cpu(per_cpu_osnoise_var, cpu).sampling = 0;
+ }
- osnoise_data.timerlat_tracer = 0;
+ osnoise_workload_stop();
}
static int timerlat_tracer_init(struct trace_array *tr)
{
- /* Only allow one instance to enable this */
- if (osnoise_busy)
+ /*
+ * Only allow timerlat tracer if osnoise tracer is not running already.
+ */
+ if (osnoise_has_registered_instances() && !osnoise_data.timerlat_tracer)
return -EBUSY;
- osnoise_trace = tr;
+ /*
+ * If this is the first instance, set timerlat_tracer to block
+ * osnoise tracer start.
+ */
+ if (!osnoise_has_registered_instances())
+ osnoise_data.timerlat_tracer = 1;
tr->max_latency = 0;
-
timerlat_tracer_start(tr);
return 0;
@@ -2071,6 +2307,13 @@ static int timerlat_tracer_init(struct trace_array *tr)
static void timerlat_tracer_reset(struct trace_array *tr)
{
timerlat_tracer_stop(tr);
+
+ /*
+ * If this is the last instance, reset timerlat_tracer allowing
+ * osnoise to be started.
+ */
+ if (!osnoise_has_registered_instances())
+ osnoise_data.timerlat_tracer = 0;
}
static struct tracer timerlat_tracer __read_mostly = {
@@ -2082,6 +2325,16 @@ static struct tracer timerlat_tracer __read_mostly = {
.print_header = print_timerlat_headers,
.allow_instances = true,
};
+
+__init static int init_timerlat_tracer(void)
+{
+ return register_tracer(&timerlat_tracer);
+}
+#else /* CONFIG_TIMERLAT_TRACER */
+__init static int init_timerlat_tracer(void)
+{
+ return 0;
+}
#endif /* CONFIG_TIMERLAT_TRACER */
__init static int init_osnoise_tracer(void)
@@ -2098,15 +2351,16 @@ __init static int init_osnoise_tracer(void)
return ret;
}
-#ifdef CONFIG_TIMERLAT_TRACER
- ret = register_tracer(&timerlat_tracer);
+ ret = init_timerlat_tracer();
if (ret) {
- pr_err(BANNER "Error registering timerlat\n");
+ pr_err(BANNER "Error registering timerlat!\n");
return ret;
}
-#endif
+
osnoise_init_hotplug_support();
+ INIT_LIST_HEAD_RCU(&osnoise_instances);
+
init_tracefs();
return 0;
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 257ffb993ea2..f00de83d0246 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -137,7 +137,7 @@ static void __acct_update_integrals(struct task_struct *tsk,
* the rest of the math is done in xacct_add_tsk.
*/
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
- tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
+ tsk->acct_vm_mem1 += delta * READ_ONCE(tsk->mm->total_vm) >> 10;
}
/**
diff --git a/kernel/ucount.c b/kernel/ucount.c
index eb03f3c68375..4f5613dac227 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -150,9 +150,15 @@ static void hlist_add_ucounts(struct ucounts *ucounts)
spin_unlock_irq(&ucounts_lock);
}
+static inline bool get_ucounts_or_wrap(struct ucounts *ucounts)
+{
+ /* Returns true on a successful get, false if the count wraps. */
+ return !atomic_add_negative(1, &ucounts->count);
+}
+
struct ucounts *get_ucounts(struct ucounts *ucounts)
{
- if (ucounts && atomic_add_negative(1, &ucounts->count)) {
+ if (!get_ucounts_or_wrap(ucounts)) {
put_ucounts(ucounts);
ucounts = NULL;
}
@@ -163,7 +169,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new;
- long overflow;
+ bool wrapped;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -188,9 +194,9 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
return new;
}
}
- overflow = atomic_add_negative(1, &ucounts->count);
+ wrapped = !get_ucounts_or_wrap(ucounts);
spin_unlock_irq(&ucounts_lock);
- if (overflow) {
+ if (wrapped) {
put_ucounts(ucounts);
return NULL;
}
@@ -276,7 +282,7 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
struct ucounts *iter;
long new = -1; /* Silence compiler warning */
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
- long dec = atomic_long_add_return(-v, &iter->ucount[type]);
+ long dec = atomic_long_sub_return(v, &iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
if (iter == ucounts)
new = dec;
@@ -289,7 +295,7 @@ static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
{
struct ucounts *iter, *next;
for (iter = ucounts; iter != last; iter = next) {
- long dec = atomic_long_add_return(-1, &iter->ucount[type]);
+ long dec = atomic_long_sub_return(1, &iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
next = iter->ns->ucounts;
if (dec == 0)
@@ -326,7 +332,7 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
}
return ret;
dec_unwind:
- dec = atomic_long_add_return(-1, &iter->ucount[type]);
+ dec = atomic_long_sub_return(1, &iter->ucount[type]);
WARN_ON_ONCE(dec < 0);
unwind:
do_dec_rlimit_put_ucounts(ucounts, iter, type);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1b3eb1e9531f..613917bbc4e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -375,6 +375,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
static void show_pwq(struct pool_workqueue *pwq);
+static void show_one_worker_pool(struct worker_pool *pool);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -1350,7 +1351,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
struct worker_pool *pool = pwq->pool;
/* record the work call stack in order to print it in KASAN reports */
- kasan_record_aux_stack(work);
+ kasan_record_aux_stack_noalloc(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
@@ -4447,7 +4448,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
raw_spin_unlock_irq(&pwq->pool->lock);
mutex_unlock(&wq->mutex);
mutex_unlock(&wq_pool_mutex);
- show_workqueue_state();
+ show_one_workqueue(wq);
return;
}
raw_spin_unlock_irq(&pwq->pool->lock);
@@ -4797,97 +4798,116 @@ static void show_pwq(struct pool_workqueue *pwq)
}
/**
- * show_workqueue_state - dump workqueue state
- *
- * Called from a sysrq handler or try_to_freeze_tasks() and prints out
- * all busy workqueues and pools.
+ * show_one_workqueue - dump state of specified workqueue
+ * @wq: workqueue whose state will be printed
*/
-void show_workqueue_state(void)
+void show_one_workqueue(struct workqueue_struct *wq)
{
- struct workqueue_struct *wq;
- struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+ bool idle = true;
unsigned long flags;
- int pi;
-
- rcu_read_lock();
-
- pr_info("Showing busy workqueues and worker pools:\n");
-
- list_for_each_entry_rcu(wq, &workqueues, list) {
- struct pool_workqueue *pwq;
- bool idle = true;
- for_each_pwq(pwq, wq) {
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
- idle = false;
- break;
- }
+ for_each_pwq(pwq, wq) {
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ idle = false;
+ break;
}
- if (idle)
- continue;
+ }
+ if (idle) /* Nothing to print for idle workqueue */
+ return;
- pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
+ pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags);
- for_each_pwq(pwq, wq) {
- raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
- /*
- * Defer printing to avoid deadlocks in console
- * drivers that queue work while holding locks
- * also taken in their write paths.
- */
- printk_deferred_enter();
- show_pwq(pwq);
- printk_deferred_exit();
- }
- raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
+ for_each_pwq(pwq, wq) {
+ raw_spin_lock_irqsave(&pwq->pool->lock, flags);
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
/*
- * We could be printing a lot from atomic context, e.g.
- * sysrq-t -> show_workqueue_state(). Avoid triggering
- * hard lockup.
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
*/
- touch_nmi_watchdog();
- }
- }
-
- for_each_pool(pool, pi) {
- struct worker *worker;
- bool first = true;
-
- raw_spin_lock_irqsave(&pool->lock, flags);
- if (pool->nr_workers == pool->nr_idle)
- goto next_pool;
- /*
- * Defer printing to avoid deadlocks in console drivers that
- * queue work while holding locks also taken in their write
- * paths.
- */
- printk_deferred_enter();
- pr_info("pool %d:", pool->id);
- pr_cont_pool_info(pool);
- pr_cont(" hung=%us workers=%d",
- jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
- pool->nr_workers);
- if (pool->manager)
- pr_cont(" manager: %d",
- task_pid_nr(pool->manager->task));
- list_for_each_entry(worker, &pool->idle_list, entry) {
- pr_cont(" %s%d", first ? "idle: " : "",
- task_pid_nr(worker->task));
- first = false;
+ printk_deferred_enter();
+ show_pwq(pwq);
+ printk_deferred_exit();
}
- pr_cont("\n");
- printk_deferred_exit();
- next_pool:
- raw_spin_unlock_irqrestore(&pool->lock, flags);
+ raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
- * sysrq-t -> show_workqueue_state(). Avoid triggering
+ * sysrq-t -> show_all_workqueues(). Avoid triggering
* hard lockup.
*/
touch_nmi_watchdog();
}
+}
+
+/**
+ * show_one_worker_pool - dump state of specified worker pool
+ * @pool: worker pool whose state will be printed
+ */
+static void show_one_worker_pool(struct worker_pool *pool)
+{
+ struct worker *worker;
+ bool first = true;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&pool->lock, flags);
+ if (pool->nr_workers == pool->nr_idle)
+ goto next_pool;
+ /*
+ * Defer printing to avoid deadlocks in console drivers that
+ * queue work while holding locks also taken in their write
+ * paths.
+ */
+ printk_deferred_enter();
+ pr_info("pool %d:", pool->id);
+ pr_cont_pool_info(pool);
+ pr_cont(" hung=%us workers=%d",
+ jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000,
+ pool->nr_workers);
+ if (pool->manager)
+ pr_cont(" manager: %d",
+ task_pid_nr(pool->manager->task));
+ list_for_each_entry(worker, &pool->idle_list, entry) {
+ pr_cont(" %s%d", first ? "idle: " : "",
+ task_pid_nr(worker->task));
+ first = false;
+ }
+ pr_cont("\n");
+ printk_deferred_exit();
+next_pool:
+ raw_spin_unlock_irqrestore(&pool->lock, flags);
+ /*
+ * We could be printing a lot from atomic context, e.g.
+ * sysrq-t -> show_all_workqueues(). Avoid triggering
+ * hard lockup.
+ */
+ touch_nmi_watchdog();
+
+}
+
+/**
+ * show_all_workqueues - dump workqueue state
+ *
+ * Called from a sysrq handler or try_to_freeze_tasks() and prints out
+ * all busy workqueues and pools.
+ */
+void show_all_workqueues(void)
+{
+ struct workqueue_struct *wq;
+ struct worker_pool *pool;
+ int pi;
+
+ rcu_read_lock();
+
+ pr_info("Showing busy workqueues and worker pools:\n");
+
+ list_for_each_entry_rcu(wq, &workqueues, list)
+ show_one_workqueue(wq);
+
+ for_each_pool(pool, pi)
+ show_one_worker_pool(pool);
+
rcu_read_unlock();
}
@@ -5384,9 +5404,6 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
int ret = -EINVAL;
cpumask_var_t saved_cpumask;
- if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL))
- return -ENOMEM;
-
/*
* Not excluding isolated cpus on purpose.
* If the user wishes to include them, we allow that.
@@ -5394,6 +5411,15 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
apply_wqattrs_lock();
+ if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
+ ret = 0;
+ goto out_unlock;
+ }
+
+ if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
/* save the old wq_unbound_cpumask. */
cpumask_copy(saved_cpumask, wq_unbound_cpumask);
@@ -5406,10 +5432,11 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
if (ret < 0)
cpumask_copy(wq_unbound_cpumask, saved_cpumask);
+ free_cpumask_var(saved_cpumask);
+out_unlock:
apply_wqattrs_unlock();
}
- free_cpumask_var(saved_cpumask);
return ret;
}
@@ -5869,7 +5896,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
rcu_read_unlock();
if (lockup_detected)
- show_workqueue_state();
+ show_all_workqueues();
wq_watchdog_reset_touched();
mod_timer(&wq_watchdog_timer, jiffies + thresh);