Diffstat (limited to 'kernel')
-rw-r--r--  kernel/auditsc.c                 |   2
-rw-r--r--  kernel/bpf/arraymap.c            |   1
-rw-r--r--  kernel/bpf/bpf_struct_ops.c      |   7
-rw-r--r--  kernel/bpf/core.c                |  26
-rw-r--r--  kernel/bpf/disasm.c              |   2
-rw-r--r--  kernel/bpf/disasm.h              |   2
-rw-r--r--  kernel/bpf/stackmap.c            |  13
-rw-r--r--  kernel/bpf/syscall.c             |  11
-rw-r--r--  kernel/bpf/verifier.c            |   4
-rw-r--r--  kernel/cgroup/cgroup.c           |  67
-rw-r--r--  kernel/cgroup/cpuset.c           |  56
-rw-r--r--  kernel/cred.c                    |   9
-rw-r--r--  kernel/dma/debug.c               |  39
-rw-r--r--  kernel/dma/debug.h               |  24
-rw-r--r--  kernel/dma/mapping.c             |  27
-rw-r--r--  kernel/entry/common.c            |   4
-rw-r--r--  kernel/events/core.c             |  36
-rw-r--r--  kernel/events/uprobes.c          |   3
-rw-r--r--  kernel/futex.c                   | 190
-rw-r--r--  kernel/irq/irqdomain.c           |   2
-rw-r--r--  kernel/locking/rtmutex.c         |   2
-rw-r--r--  kernel/locking/rwbase_rt.c       |  65
-rw-r--r--  kernel/locking/rwsem.c           |  10
-rw-r--r--  kernel/module.c                  |   2
-rw-r--r--  kernel/printk/printk.c           |   4
-rw-r--r--  kernel/rseq.c                    |  14
-rw-r--r--  kernel/sched/core.c              |   7
-rw-r--r--  kernel/sched/debug.c             |   8
-rw-r--r--  kernel/sched/fair.c              |   6
-rw-r--r--  kernel/sched/idle.c              |   4
-rw-r--r--  kernel/signal.c                  |  25
-rw-r--r--  kernel/time/posix-cpu-timers.c   |   3
-rw-r--r--  kernel/trace/blktrace.c          |   8
-rw-r--r--  kernel/trace/ftrace.c            |  22
-rw-r--r--  kernel/trace/trace.c             |  11
-rw-r--r--  kernel/trace/trace_boot.c        |  37
-rw-r--r--  kernel/trace/trace_eprobe.c      |  65
-rw-r--r--  kernel/trace/trace_events_hist.c |   2
-rw-r--r--  kernel/ucount.c                  |  49
-rw-r--r--  kernel/workqueue.c               |  18
40 files changed, 561 insertions, 326 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8dd73a64f921..b1cb1dbf7417 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -657,7 +657,7 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val);
break;
case AUDIT_SADDR_FAM:
- if (ctx->sockaddr)
+ if (ctx && ctx->sockaddr)
result = audit_comparator(ctx->sockaddr->ss_family,
f->op, f->val);
break;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cebd4fb06d19..447def540544 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -1072,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr)
INIT_WORK(&aux->work, prog_array_map_clear_deferred);
INIT_LIST_HEAD(&aux->poke_progs);
mutex_init(&aux->poke_mutex);
+ spin_lock_init(&aux->owner.lock);
map = array_map_alloc(attr);
if (IS_ERR(map)) {
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d6731c32864e..9abcc33f02cf 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
+ u32 flags;
moff = btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
@@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
+ flags = st_ops->func_models[i].ret_size > 0 ?
+ BPF_TRAMP_F_RET_FENTRY_RET : 0;
err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
- &st_ops->func_models[i], 0,
- tprogs, NULL);
+ &st_ops->func_models[i],
+ flags, tprogs, NULL);
if (err < 0)
goto reset_unlock;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 9f4636d021b1..6e3ae90ad107 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -524,6 +524,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON);
int bpf_jit_harden __read_mostly;
long bpf_jit_limit __read_mostly;
+long bpf_jit_limit_max __read_mostly;
static void
bpf_prog_ksym_set_addr(struct bpf_prog *prog)
@@ -817,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void)
static int __init bpf_jit_charge_init(void)
{
/* Only used as heuristic here to derive limit. */
- bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2,
+ bpf_jit_limit_max = bpf_jit_alloc_exec_limit();
+ bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2,
PAGE_SIZE), LONG_MAX);
return 0;
}
@@ -827,7 +829,7 @@ int bpf_jit_charge_modmem(u32 pages)
{
if (atomic_long_add_return(pages, &bpf_jit_current) >
(bpf_jit_limit >> PAGE_SHIFT)) {
- if (!capable(CAP_SYS_ADMIN)) {
+ if (!bpf_capable()) {
atomic_long_sub(pages, &bpf_jit_current);
return -EPERM;
}
@@ -1821,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx,
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
{
+ bool ret;
+
if (fp->kprobe_override)
return false;
- if (!array->aux->type) {
+ spin_lock(&array->aux->owner.lock);
+
+ if (!array->aux->owner.type) {
/* There's no owner yet where we could check for
* compatibility.
*/
- array->aux->type = fp->type;
- array->aux->jited = fp->jited;
- return true;
+ array->aux->owner.type = fp->type;
+ array->aux->owner.jited = fp->jited;
+ ret = true;
+ } else {
+ ret = array->aux->owner.type == fp->type &&
+ array->aux->owner.jited == fp->jited;
}
-
- return array->aux->type == fp->type &&
- array->aux->jited == fp->jited;
+ spin_unlock(&array->aux->owner.lock);
+ return ret;
}
static int bpf_check_tail_call(const struct bpf_prog *fp)
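
The bpf_prog_array_compatible() hunk above turns an unlocked check-or-set of the owner fields into one done under aux->owner.lock. A minimal userspace sketch of the same check-or-record pattern, using a pthread mutex in place of the kernel spinlock; the struct and function names are illustrative, not kernel API:

#include <pthread.h>
#include <stdbool.h>

struct owner_info {
	pthread_mutex_t lock;	/* plays the role of aux->owner.lock */
	bool set;
	int type;
	bool jited;
};

/* First caller records its attributes; later callers must match them. */
static bool owner_compatible(struct owner_info *o, int type, bool jited)
{
	bool ret;

	pthread_mutex_lock(&o->lock);
	if (!o->set) {
		o->set = true;
		o->type = type;
		o->jited = jited;
		ret = true;
	} else {
		ret = (o->type == type && o->jited == jited);
	}
	pthread_mutex_unlock(&o->lock);
	return ret;
}
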
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
index ca3cd9aaa6ce..7b4afb7d96db 100644
--- a/kernel/bpf/disasm.c
+++ b/kernel/bpf/disasm.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-only
+// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e546b18d27da..a4b040793f44 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
+/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
*/
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index e8eefdf8cf3e..6e75bbee39f0 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map)
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
- u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
+ u64 elem_size = sizeof(struct stack_map_bucket) +
+ (u64)smap->map.value_size;
int err;
smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
@@ -179,7 +180,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
* with build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
- !mmap_read_trylock_non_owner(current->mm)) {
+ !mmap_read_trylock(current->mm)) {
/* cannot access current->mm, fall back to ips */
for (i = 0; i < trace_nr; i++) {
id_offs[i].status = BPF_STACK_BUILD_ID_IP;
@@ -204,9 +205,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
}
if (!work) {
- mmap_read_unlock_non_owner(current->mm);
+ mmap_read_unlock(current->mm);
} else {
work->mm = current->mm;
+
+ /* The lock will be released once we're out of interrupt
+ * context. Tell lockdep that we've released it now so
+ * it doesn't complain that we forgot to release it.
+ */
+ rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
irq_work_queue(&work->irq_work);
}
}
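
The elem_size widening above exists because a u32 * u32 product is evaluated in 32 bits and can wrap before the allocation size is computed; promoting one operand to 64 bits keeps the full product. A small standalone illustration, with values chosen only to show the wrap:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t value_size  = 0x100000;	/* 1 MiB per element */
	uint32_t max_entries = 0x1000;		/* 4096 elements */

	uint32_t wrapped = value_size * max_entries;		/* 2^32 wraps to 0 */
	uint64_t widened = (uint64_t)value_size * max_entries;	/* full 4 GiB kept */

	printf("32-bit product: %u, 64-bit product: %llu\n",
	       wrapped, (unsigned long long)widened);
	return 0;
}
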
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4e50c0bfdb7d..1cad6979a0d0 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -543,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
- type = array->aux->type;
- jited = array->aux->jited;
+ spin_lock(&array->aux->owner.lock);
+ type = array->aux->owner.type;
+ jited = array->aux->owner.jited;
+ spin_unlock(&array->aux->owner.lock);
}
seq_printf(m,
@@ -1337,12 +1339,11 @@ int generic_map_update_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
u32 value_size, cp, max_count;
- int ufd = attr->map_fd;
+ int ufd = attr->batch.map_fd;
void *key, *value;
struct fd f;
int err = 0;
- f = fdget(ufd);
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
@@ -1367,6 +1368,7 @@ int generic_map_update_batch(struct bpf_map *map,
return -ENOMEM;
}
+ f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */
for (cp = 0; cp < max_count; cp++) {
err = -EFAULT;
if (copy_from_user(key, keys + cp * map->key_size,
@@ -1386,6 +1388,7 @@ int generic_map_update_batch(struct bpf_map *map,
kvfree(value);
kvfree(key);
+ fdput(f);
return err;
}
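
The generic_map_update_batch() hunks move the fdget() past the early parameter checks, read the fd from attr->batch.map_fd, and add the missing fdput() on the exit path. A userspace sketch of the same reference discipline — validate first, take the reference once, release it on the way out; the function name, flag mask, and error values are illustrative:

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

static int process_batch(const char *path, unsigned int flags)
{
	int fd, err = 0;

	if (flags & ~0x1u)		/* validate before taking any reference */
		return -EINVAL;

	fd = open(path, O_RDONLY);	/* take the reference only after the checks */
	if (fd < 0)
		return -errno;

	/* ... per-element work would go here ... */

	close(fd);			/* always released on the exit path */
	return err;
}
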
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 047ac4b4703b..de006552be8a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -9912,6 +9912,8 @@ static int check_btf_line(struct bpf_verifier_env *env,
nr_linfo = attr->line_info_cnt;
if (!nr_linfo)
return 0;
+ if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info))
+ return -EINVAL;
rec_size = attr->line_info_rec_size;
if (rec_size < MIN_BPF_LINEINFO_SIZE ||
@@ -13317,7 +13319,7 @@ BTF_SET_START(btf_non_sleepable_error_inject)
/* Three functions below can be called from sleepable and non-sleepable context.
* Assume non-sleepable from bpf safety point of view.
*/
-BTF_ID(func, __add_to_page_cache_locked)
+BTF_ID(func, __filemap_add_folio)
BTF_ID(func, should_fail_alloc_page)
BTF_ID(func, should_failslab)
BTF_SET_END(btf_non_sleepable_error_inject)
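
The check_btf_line() hunk bounds nr_linfo so that the later nr_linfo * sizeof(struct bpf_line_info) size computation cannot overflow. The same guard in generic form, with a made-up element type:

#include <limits.h>
#include <stdlib.h>

struct line_info { unsigned int insn_off, file_name_off, line, col; };

static void *alloc_line_infos(unsigned int count)
{
	/* Reject counts whose byte size would exceed the checked bound. */
	if (count > INT_MAX / sizeof(struct line_info))
		return NULL;

	return malloc(count * sizeof(struct line_info));
}
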
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 881ce1470beb..ea08f01d0111 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -2187,8 +2187,10 @@ static void cgroup_kill_sb(struct super_block *sb)
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -6572,74 +6574,51 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled) {
- skcd->no_refcnt = 1;
- return;
- }
-
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
-
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
- if (skcd->val) {
- if (skcd->no_refcnt)
- return;
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- }
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
+
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- if (skcd->no_refcnt)
- return;
cgroup_bpf_put(cgrp);
cgroup_put(cgrp);
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index df1ccf4558f8..2a9695ccb65f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -311,17 +311,19 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_rwsem write lock. Other
+ * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
+ * prevent change to cpuset structures.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
+ * just holding cpuset_rwsem. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets.
* Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
@@ -393,7 +395,7 @@ static inline bool is_in_v2_mode(void)
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_cpus(struct task_struct *tsk,
struct cpumask *pmask)
@@ -435,7 +437,7 @@ out_unlock:
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -447,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -468,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
+ * are only set if the other's are set. Call holding cpuset_rwsem.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -577,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs)
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_mutex held.
+ * cpuset_rwsem held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -700,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_mutex held. */
+/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -726,7 +728,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_mutex held.
+ * Must be called with cpuset_rwsem held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -1005,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes cpus_read_lock().
+ * Call with cpuset_rwsem held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
@@ -1078,7 +1080,7 @@ void rebuild_sched_domains(void)
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_cpumask(struct cpuset *cs)
@@ -1347,7 +1349,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
{
@@ -1704,12 +1706,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_mutex */
+ static nodemask_t newmems; /* protected by cpuset_rwsem */
struct css_task_iter it;
struct task_struct *task;
@@ -1722,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_mutex, we know that no other rebind effort
+ * the global cpuset_rwsem, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1768,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -1821,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_lock during call.
+ * Call with cpuset_rwsem held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -1911,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
+ * function is called with cpuset_rwsem held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -1931,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1980,7 +1982,7 @@ out:
* cs: the cpuset to update
* new_prs: new partition root state
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
@@ -2167,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
@@ -2219,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
}
/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
@@ -2227,7 +2229,7 @@ static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_mutex */
+ /* static buf protected by cpuset_rwsem */
static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
@@ -2417,7 +2419,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * grabbing cpuset_rwsem anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
@@ -3672,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/cred.c b/kernel/cred.c
index f784e08c2fbd..1ae0b4948a5a 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -225,8 +225,6 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC;
#endif
- new->ucounts = get_ucounts(&init_ucounts);
-
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error;
@@ -501,7 +499,7 @@ int commit_creds(struct cred *new)
inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
- if (new->user != old->user)
+ if (new->user != old->user || new->user_ns != old->user_ns)
dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2);
@@ -669,7 +667,7 @@ int set_cred_ucounts(struct cred *new)
{
struct task_struct *task = current;
const struct cred *old = task->real_cred;
- struct ucounts *old_ucounts = new->ucounts;
+ struct ucounts *new_ucounts, *old_ucounts = new->ucounts;
if (new->user == old->user && new->user_ns == old->user_ns)
return 0;
@@ -681,9 +679,10 @@ int set_cred_ucounts(struct cred *new)
if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
return 0;
- if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid)))
+ if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid)))
return -EAGAIN;
+ new->ucounts = new_ucounts;
if (old_ucounts)
put_ucounts(old_ucounts);
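
The set_cred_ucounts() change allocates into a local and installs it in new->ucounts only after the allocation has succeeded, so the -EAGAIN path leaves the cred untouched. A generic sketch of that error-handling shape, with illustrative types:

#include <stdlib.h>

struct counts { long users; };

static int replace_counts(struct counts **slot)
{
	struct counts *new_counts, *old_counts = *slot;

	new_counts = calloc(1, sizeof(*new_counts));
	if (!new_counts)
		return -1;	/* failure: *slot still holds the old, valid object */

	*slot = new_counts;	/* commit only after success */
	free(old_counts);
	return 0;
}
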
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index 6c90c69e5311..7a14ca29c377 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -552,7 +552,7 @@ static void active_cacheline_remove(struct dma_debug_entry *entry)
* Wrapper function for adding an entry to the hash.
* This function takes care of locking itself.
*/
-static void add_dma_entry(struct dma_debug_entry *entry)
+static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs)
{
struct hash_bucket *bucket;
unsigned long flags;
@@ -566,8 +566,9 @@ static void add_dma_entry(struct dma_debug_entry *entry)
if (rc == -ENOMEM) {
pr_err("cacheline tracking ENOMEM, dma-debug disabled\n");
global_disable = true;
- } else if (rc == -EEXIST) {
- pr_err("cacheline tracking EEXIST, overlapping mappings aren't supported\n");
+ } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) {
+ err_printk(entry->dev, entry,
+ "cacheline tracking EEXIST, overlapping mappings aren't supported\n");
}
}
@@ -1190,7 +1191,8 @@ void debug_dma_map_single(struct device *dev, const void *addr,
EXPORT_SYMBOL(debug_dma_map_single);
void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
- size_t size, int direction, dma_addr_t dma_addr)
+ size_t size, int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1221,7 +1223,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset,
check_for_illegal_area(dev, addr, size);
}
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
@@ -1279,7 +1281,8 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
struct scatterlist *s;
@@ -1288,6 +1291,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
if (unlikely(dma_debug_disabled()))
return;
+ for_each_sg(sg, s, nents, i) {
+ check_for_stack(dev, sg_page(s), s->offset);
+ if (!PageHighMem(sg_page(s)))
+ check_for_illegal_area(dev, sg_virt(s), s->length);
+ }
+
for_each_sg(sg, s, mapped_ents, i) {
entry = dma_entry_alloc();
if (!entry)
@@ -1303,15 +1312,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
entry->sg_call_ents = nents;
entry->sg_mapped_ents = mapped_ents;
- check_for_stack(dev, sg_page(s), s->offset);
-
- if (!PageHighMem(sg_page(s))) {
- check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
- }
-
check_sg_segment(dev, s);
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
}
@@ -1367,7 +1370,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
}
void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1397,7 +1401,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size,
else
entry->pfn = page_to_pfn(virt_to_page(virt));
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_free_coherent(struct device *dev, size_t size,
@@ -1428,7 +1432,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size,
}
void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
struct dma_debug_entry *entry;
@@ -1448,7 +1453,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size,
entry->direction = direction;
entry->map_err_type = MAP_ERR_NOT_CHECKED;
- add_dma_entry(entry);
+ add_dma_entry(entry, attrs);
}
void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h
index 83643b3010b2..f525197d3cae 100644
--- a/kernel/dma/debug.h
+++ b/kernel/dma/debug.h
@@ -11,26 +11,30 @@
#ifdef CONFIG_DMA_API_DEBUG
extern void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr);
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
size_t size, int direction);
extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction);
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs);
extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
int nelems, int dir);
extern void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt);
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs);
extern void debug_dma_free_coherent(struct device *dev, size_t size,
void *virt, dma_addr_t addr);
extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr);
+ dma_addr_t dma_addr,
+ unsigned long attrs);
extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr,
size_t size, int direction);
@@ -53,7 +57,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev,
#else /* CONFIG_DMA_API_DEBUG */
static inline void debug_dma_map_page(struct device *dev, struct page *page,
size_t offset, size_t size,
- int direction, dma_addr_t dma_addr)
+ int direction, dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
@@ -63,7 +68,8 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr,
}
static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
- int nents, int mapped_ents, int direction)
+ int nents, int mapped_ents, int direction,
+ unsigned long attrs)
{
}
@@ -74,7 +80,8 @@ static inline void debug_dma_unmap_sg(struct device *dev,
}
static inline void debug_dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t dma_addr, void *virt)
+ dma_addr_t dma_addr, void *virt,
+ unsigned long attrs)
{
}
@@ -85,7 +92,8 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size,
static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr,
size_t size, int direction,
- dma_addr_t dma_addr)
+ dma_addr_t dma_addr,
+ unsigned long attrs)
{
}
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 7ee5284bff58..8349a9f2c345 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -156,7 +156,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page,
addr = dma_direct_map_page(dev, page, offset, size, dir, attrs);
else
addr = ops->map_page(dev, page, offset, size, dir, attrs);
- debug_dma_map_page(dev, page, offset, size, dir, addr);
+ debug_dma_map_page(dev, page, offset, size, dir, addr, attrs);
return addr;
}
@@ -195,7 +195,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
ents = ops->map_sg(dev, sg, nents, dir, attrs);
if (ents > 0)
- debug_dma_map_sg(dev, sg, nents, ents, dir);
+ debug_dma_map_sg(dev, sg, nents, ents, dir, attrs);
else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
ents != -EIO))
return -EIO;
@@ -206,7 +206,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
/**
* dma_map_sg_attrs - Map the given buffer for DMA
* @dev: The device for which to perform the DMA operation
- * @sg: The sg_table object describing the buffer
+ * @sg: The sg_table object describing the buffer
+ * @nents: Number of entries to map
* @dir: DMA direction
* @attrs: Optional DMA attributes for the map operation
*
@@ -248,12 +249,12 @@ EXPORT_SYMBOL(dma_map_sg_attrs);
* Returns 0 on success or a negative error code on error. The following
* error codes are supported with the given meaning:
*
- * -EINVAL - An invalid argument, unaligned access or other error
- * in usage. Will not succeed if retried.
- * -ENOMEM - Insufficient resources (like memory or IOVA space) to
- * complete the mapping. Should succeed if retried later.
- * -EIO - Legacy error code with an unknown meaning. eg. this is
- * returned if a lower level call returned DMA_MAPPING_ERROR.
+ * -EINVAL An invalid argument, unaligned access or other error
+ * in usage. Will not succeed if retried.
+ * -ENOMEM Insufficient resources (like memory or IOVA space) to
+ * complete the mapping. Should succeed if retried later.
+ * -EIO Legacy error code with an unknown meaning. eg. this is
+ * returned if a lower level call returned DMA_MAPPING_ERROR.
*/
int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
enum dma_data_direction dir, unsigned long attrs)
@@ -304,7 +305,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
else if (ops->map_resource)
addr = ops->map_resource(dev, phys_addr, size, dir, attrs);
- debug_dma_map_resource(dev, phys_addr, size, dir, addr);
+ debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs);
return addr;
}
EXPORT_SYMBOL(dma_map_resource);
@@ -509,7 +510,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle,
else
return NULL;
- debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr);
+ debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs);
return cpu_addr;
}
EXPORT_SYMBOL(dma_alloc_attrs);
@@ -565,7 +566,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size,
struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp);
if (page)
- debug_dma_map_page(dev, page, 0, size, dir, *dma_handle);
+ debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0);
return page;
}
EXPORT_SYMBOL_GPL(dma_alloc_pages);
@@ -643,7 +644,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size,
if (sgt) {
sgt->nents = 1;
- debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir);
+ debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs);
}
return sgt;
}
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index bf16395b9e13..d5a61d565ad5 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
handle_signal_work(regs, ti_work);
- if (ti_work & _TIF_NOTIFY_RESUME) {
+ if (ti_work & _TIF_NOTIFY_RESUME)
tracehook_notify_resume(regs);
- rseq_handle_notify_resume(NULL, regs);
- }
/* Architecture specific TIF work */
arch_exit_to_user_mode_work(regs, ti_work);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 744e8726c5b2..f23ca260307f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3707,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx,
return 0;
}
+static inline bool event_update_userpage(struct perf_event *event)
+{
+ if (likely(!atomic_read(&event->mmap_count)))
+ return false;
+
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
+ perf_event_update_userpage(event);
+
+ return true;
+}
+
+static inline void group_update_userpage(struct perf_event *group_event)
+{
+ struct perf_event *event;
+
+ if (!event_update_userpage(group_event))
+ return;
+
+ for_each_sibling_event(event, group_event)
+ event_update_userpage(event);
+}
+
static int merge_sched_in(struct perf_event *event, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -3725,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data)
}
if (event->state == PERF_EVENT_STATE_INACTIVE) {
+ *can_add_hw = 0;
if (event->attr.pinned) {
perf_cgroup_event_disable(event, ctx);
perf_event_set_state(event, PERF_EVENT_STATE_ERROR);
+ } else {
+ ctx->rotate_necessary = 1;
+ perf_mux_hrtimer_restart(cpuctx);
+ group_update_userpage(event);
}
-
- *can_add_hw = 0;
- ctx->rotate_necessary = 1;
- perf_mux_hrtimer_restart(cpuctx);
}
return 0;
@@ -6324,6 +6348,8 @@ accounting:
ring_buffer_attach(event, rb);
+ perf_event_update_time(event);
+ perf_set_shadow_time(event, event->ctx);
perf_event_init_userpage(event);
perf_event_update_userpage(event);
} else {
@@ -10193,7 +10219,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
return;
if (ifh->nr_file_filters) {
- mm = get_task_mm(event->ctx->task);
+ mm = get_task_mm(task);
if (!mm)
goto restart;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index af24dc3febbe..6357c3580d07 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
addr + PAGE_SIZE);
if (new_page) {
- err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
+ err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
+ GFP_KERNEL);
if (err)
return err;
}
diff --git a/kernel/futex.c b/kernel/futex.c
index e7b4c6121da4..c15ad276fd15 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1263,6 +1263,36 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval,
return -ESRCH;
}
+static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ /*
+ * No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
+ */
+ struct futex_pi_state *pi_state = alloc_pi_state();
+
+ /*
+ * Initialize the pi_mutex in locked state and make @p
+ * the owner of it:
+ */
+ rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+ /* Store the key for possible exit cleanups: */
+ pi_state->key = *key;
+
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
+ pi_state->owner = p;
+
+ *ps = pi_state;
+}
/*
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
@@ -1272,7 +1302,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
struct task_struct **exiting)
{
pid_t pid = uval & FUTEX_TID_MASK;
- struct futex_pi_state *pi_state;
struct task_struct *p;
/*
@@ -1324,36 +1353,11 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
return ret;
}
- /*
- * No existing pi state. First waiter. [2]
- *
- * This creates pi_state, we have hb->lock held, this means nothing can
- * observe this state, wait_lock is irrelevant.
- */
- pi_state = alloc_pi_state();
-
- /*
- * Initialize the pi_mutex in locked state and make @p
- * the owner of it:
- */
- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
-
- /* Store the key for possible exit cleanups: */
- pi_state->key = *key;
-
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
- /*
- * Assignment without holding pi_state->pi_mutex.wait_lock is safe
- * because there is no concurrency as the object is not published yet.
- */
- pi_state->owner = p;
+ __attach_to_pi_owner(p, key, ps);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
- *ps = pi_state;
-
return 0;
}
@@ -1454,8 +1458,26 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
newval |= FUTEX_WAITERS;
ret = lock_pi_update_atomic(uaddr, uval, newval);
- /* If the take over worked, return 1 */
- return ret < 0 ? ret : 1;
+ if (ret)
+ return ret;
+
+ /*
+ * If the waiter bit was requested the caller also needs PI
+ * state attached to the new owner of the user space futex.
+ *
+ * @task is guaranteed to be alive and it cannot be exiting
+ * because it is either sleeping or waiting in
+ * futex_requeue_pi_wakeup_sync().
+ *
+ * No need to do the full attach_to_pi_owner() exercise
+ * because @task is known and valid.
+ */
+ if (set_waiters) {
+ raw_spin_lock_irq(&task->pi_lock);
+ __attach_to_pi_owner(task, key, ps);
+ raw_spin_unlock_irq(&task->pi_lock);
+ }
+ return 1;
}
/*
@@ -1939,12 +1961,26 @@ static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q)
* @hb: the hash_bucket of the requeue target futex
*
* During futex_requeue, with requeue_pi=1, it is possible to acquire the
- * target futex if it is uncontended or via a lock steal. Set the futex_q key
- * to the requeue target futex so the waiter can detect the wakeup on the right
- * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock
- * to protect access to the pi_state to fixup the owner later. Must be called
- * with both q->lock_ptr and hb->lock held.
+ * target futex if it is uncontended or via a lock steal.
+ *
+ * 1) Set @q::key to the requeue target futex key so the waiter can detect
+ * the wakeup on the right futex.
+ *
+ * 2) Dequeue @q from the hash bucket.
+ *
+ * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock
+ * acquisition.
+ *
+ * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that
+ * the waiter has to fixup the pi state.
+ *
+ * 5) Complete the requeue state so the waiter can make progress. After
+ * this point the waiter task can return from the syscall immediately in
+ * case that the pi state does not have to be fixed up.
+ *
+ * 6) Wake the waiter task.
+ *
+ * Must be called with both q->lock_ptr and hb->lock held.
*/
static inline
void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
@@ -1998,7 +2034,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
{
struct futex_q *top_waiter = NULL;
u32 curval;
- int ret, vpid;
+ int ret;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
@@ -2025,7 +2061,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
* and waiting on the 'waitqueue' futex which is always !PI.
*/
if (!top_waiter->rt_waiter || top_waiter->pi_state)
- ret = -EINVAL;
+ return -EINVAL;
/* Ensure we requeue to the expected futex. */
if (!match_futex(top_waiter->requeue_pi_key, key2))
@@ -2036,17 +2072,23 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
return -EAGAIN;
/*
- * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
- * the contended case or if set_waiters is 1. The pi_state is returned
- * in ps in contended cases.
+ * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
+ * in the contended case or if @set_waiters is true.
+ *
+ * In the contended case PI state is attached to the lock owner. If
+ * the user space lock can be acquired then PI state is attached to
+ * the new owner (@top_waiter->task) when @set_waiters is true.
*/
- vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
exiting, set_waiters);
if (ret == 1) {
- /* Dequeue, wake up and update top_waiter::requeue_state */
+ /*
+ * Lock was acquired in user space and PI state was
+ * attached to @top_waiter->task. That means state is fully
+ * consistent and the waiter can return to user space
+ * immediately after the wakeup.
+ */
requeue_pi_wake_futex(top_waiter, key2, hb2);
- return vpid;
} else if (ret < 0) {
/* Rewind top_waiter::requeue_state */
futex_requeue_pi_complete(top_waiter, ret);
@@ -2208,19 +2250,26 @@ retry_private:
&exiting, nr_requeue);
/*
- * At this point the top_waiter has either taken uaddr2 or is
- * waiting on it. If the former, then the pi_state will not
- * exist yet, look it up one more time to ensure we have a
- * reference to it. If the lock was taken, @ret contains the
- * VPID of the top waiter task.
- * If the lock was not taken, we have pi_state and an initial
- * refcount on it. In case of an error we have nothing.
+ * At this point the top_waiter has either taken uaddr2 or
+ * is waiting on it. In both cases pi_state has been
+ * established and an initial refcount on it. In case of an
+ * error there's nothing.
*
* The top waiter's requeue_state is up to date:
*
- * - If the lock was acquired atomically (ret > 0), then
+ * - If the lock was acquired atomically (ret == 1), then
* the state is Q_REQUEUE_PI_LOCKED.
*
+ * The top waiter has been dequeued and woken up and can
+ * return to user space immediately. The kernel/user
+ * space state is consistent. In case that there must be
+ * more waiters requeued the WAITERS bit in the user
+ * space futex is set so the top waiter task has to go
+ * into the syscall slowpath to unlock the futex. This
+ * will block until this requeue operation has been
+ * completed and the hash bucket locks have been
+ * dropped.
+ *
* - If the trylock failed with an error (ret < 0) then
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
* happened", or Q_REQUEUE_PI_IGNORE when there was an
@@ -2234,36 +2283,20 @@ retry_private:
* the same sanity checks for requeue_pi as the loop
* below does.
*/
- if (ret > 0) {
- WARN_ON(pi_state);
- task_count++;
- /*
- * If futex_proxy_trylock_atomic() acquired the
- * user space futex, then the user space value
- * @uaddr2 has been set to the @hb1's top waiter
- * task VPID. This task is guaranteed to be alive
- * and cannot be exiting because it is either
- * sleeping or blocked on @hb2 lock.
- *
- * The @uaddr2 futex cannot have waiters either as
- * otherwise futex_proxy_trylock_atomic() would not
- * have succeeded.
- *
- * In order to requeue waiters to @hb2, pi state is
- * required. Hand in the VPID value (@ret) and
- * allocate PI state with an initial refcount on
- * it.
- */
- ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state,
- &exiting);
- WARN_ON(ret);
- }
-
switch (ret) {
case 0:
/* We hold a reference on the pi state. */
break;
+ case 1:
+ /*
+ * futex_proxy_trylock_atomic() acquired the user space
+ * futex. Adjust task_count.
+ */
+ task_count++;
+ ret = 0;
+ break;
+
/*
* If the above failed, then pi_state is NULL and
* waiter::requeue_state is correct.
@@ -2395,9 +2428,8 @@ retry_private:
}
/*
- * We took an extra initial reference to the pi_state either in
- * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need
- * to drop it here again.
+ * We took an extra initial reference to the pi_state in
+ * futex_proxy_trylock_atomic(). We need to drop it here again.
*/
put_pi_state(pi_state);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 19e83e9b723c..4d8fc65cf38f 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode);
* Allocates and initializes an irq_domain structure.
* Returns pointer to IRQ domain, or NULL on failure.
*/
-struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
+struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size,
irq_hw_number_t hwirq_max, int direct_max,
const struct irq_domain_ops *ops,
void *host_data)
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8eabdc79602b..6bb116c559b4 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -753,7 +753,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task,
* other configuration and we fail to report; also, see
* lockdep.
*/
- if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx)
+ if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
ret = 0;
raw_spin_unlock(&lock->wait_lock);
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 4ba15088e640..88191f6e252c 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -41,6 +41,12 @@
* The risk of writer starvation is there, but the pathological use cases
* which trigger it are not necessarily the typical RT workloads.
*
+ * Fast-path orderings:
+ * The lock/unlock of readers can run in fast paths: lock and unlock are only
+ * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE
+ * semantics of rwbase_rt. Atomic ops should thus provide _acquire()
+ * and _release() (or stronger).
+ *
* Common code shared between RT rw_semaphore and rwlock
*/
@@ -53,6 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb)
* set.
*/
for (r = atomic_read(&rwb->readers); r < 0;) {
+ /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */
if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1)))
return 1;
}
@@ -162,6 +169,8 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb,
/*
* rwb->readers can only hit 0 when a writer is waiting for the
* active readers to leave the critical section.
+ *
+ * dec_and_test() is fully ordered, provides RELEASE.
*/
if (unlikely(atomic_dec_and_test(&rwb->readers)))
__rwbase_read_unlock(rwb, state);
@@ -172,7 +181,11 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias,
{
struct rt_mutex_base *rtm = &rwb->rtmutex;
- atomic_add(READER_BIAS - bias, &rwb->readers);
+ /*
+ * _release() is needed in case that reader is in fast path, pairing
+ * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE
+ */
+ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers);
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
rwbase_rtmutex_unlock(rtm);
}
@@ -196,6 +209,23 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb)
__rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags);
}
+static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb)
+{
+ /* Can do without CAS because we're serialized by wait_lock. */
+ lockdep_assert_held(&rwb->rtmutex.wait_lock);
+
+ /*
+ * _acquire is needed in case the reader is in the fast path, pairing
+ * with rwbase_read_unlock(), provides ACQUIRE.
+ */
+ if (!atomic_read_acquire(&rwb->readers)) {
+ atomic_set(&rwb->readers, WRITER_BIAS);
+ return 1;
+ }
+
+ return 0;
+}
+
static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
unsigned int state)
{
@@ -210,34 +240,30 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
atomic_sub(READER_BIAS, &rwb->readers);
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
- /*
- * set_current_state() for rw_semaphore
- * current_save_and_set_rtlock_wait_state() for rwlock
- */
- rwbase_set_and_save_current_state(state);
+ if (__rwbase_write_trylock(rwb))
+ goto out_unlock;
- /* Block until all readers have left the critical section. */
- for (; atomic_read(&rwb->readers);) {
+ rwbase_set_and_save_current_state(state);
+ for (;;) {
/* Optimized out for rwlocks */
if (rwbase_signal_pending_state(state, current)) {
- __set_current_state(TASK_RUNNING);
+ rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags);
return -EINTR;
}
+
+ if (__rwbase_write_trylock(rwb))
+ break;
+
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+ rwbase_schedule();
+ raw_spin_lock_irqsave(&rtm->wait_lock, flags);
- /*
- * Schedule and wait for the readers to leave the critical
- * section. The last reader leaving it wakes the waiter.
- */
- if (atomic_read(&rwb->readers) != 0)
- rwbase_schedule();
set_current_state(state);
- raw_spin_lock_irqsave(&rtm->wait_lock, flags);
}
-
- atomic_set(&rwb->readers, WRITER_BIAS);
rwbase_restore_current_state();
+
+out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
return 0;
}
@@ -253,8 +279,7 @@ static inline int rwbase_write_trylock(struct rwbase_rt *rwb)
atomic_sub(READER_BIAS, &rwb->readers);
raw_spin_lock_irqsave(&rtm->wait_lock, flags);
- if (!atomic_read(&rwb->readers)) {
- atomic_set(&rwb->readers, WRITER_BIAS);
+ if (__rwbase_write_trylock(rwb)) {
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
return 1;
}
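
The rwbase_rt hunks annotate the reader fast path with _acquire()/_release() orderings. The underlying pairing is the usual one: stores made before a release operation are visible to a thread whose acquire operation observes that release. A minimal C11 sketch of the pairing, deliberately not mirroring the kernel's reader-bias encoding:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool ready;
static int payload;

static void publish(int value)
{
	payload = value;				/* plain store ...               */
	atomic_store_explicit(&ready, true,
			      memory_order_release);	/* ... published by the RELEASE  */
}

static bool consume(int *out)
{
	if (!atomic_load_explicit(&ready, memory_order_acquire))
		return false;				/* ACQUIRE pairs with the RELEASE */
	*out = payload;					/* guaranteed to see payload = value */
	return true;
}
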
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 9215b4d6a9de..000e8d5a2884 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -1376,15 +1376,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
#include "rwbase_rt.c"
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-void __rwsem_init(struct rw_semaphore *sem, const char *name,
+void __init_rwsem(struct rw_semaphore *sem, const char *name,
struct lock_class_key *key)
{
+ init_rwbase_rt(&(sem)->rwbase);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP);
-}
-EXPORT_SYMBOL(__rwsem_init);
#endif
+}
+EXPORT_SYMBOL(__init_rwsem);
static inline void __down_read(struct rw_semaphore *sem)
{
diff --git a/kernel/module.c b/kernel/module.c
index 40ec9a030eec..5c26a76e800b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -4489,8 +4489,10 @@ static void cfi_init(struct module *mod)
/* Fix init/exit functions to point to the CFI jump table */
if (init)
mod->init = *init;
+#ifdef CONFIG_MODULE_UNLOAD
if (exit)
mod->exit = *exit;
+#endif
cfi_module_add(mod, module_addr_min);
#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 825277e1e742..a8d0a58deebc 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early)
return;
err_free_descs:
- memblock_free(__pa(new_descs), new_descs_size);
+ memblock_free_ptr(new_descs, new_descs_size);
err_free_log_buf:
- memblock_free(__pa(new_log_buf), new_log_buf_len);
+ memblock_free_ptr(new_log_buf, new_log_buf_len);
}
static bool __read_mostly ignore_loglevel;
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 35f7bd0fced0..6d45ac3dae7f 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
if (unlikely(t->flags & PF_EXITING))
return;
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+
+ /*
+ * regs is NULL if and only if the caller is in a syscall path. Skip
+ * fixup and leave rseq_cs as is so that rseq_syscall() will detect and

+ * kill a misbehaving userspace on debug kernels.
+ */
+ if (regs) {
+ ret = rseq_ip_fixup(regs);
+ if (unlikely(ret < 0))
+ goto error;
+ }
if (unlikely(rseq_update_cpu_id(t)))
goto error;
return;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c4462c454ab9..f21714ea3db8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8795,6 +8795,7 @@ void idle_task_exit(void)
finish_arch_post_lock_switch();
}
+ scs_task_reset(current);
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
}
@@ -8836,7 +8837,6 @@ static void balance_push(struct rq *rq)
struct task_struct *push_task = rq->curr;
lockdep_assert_rq_held(rq);
- SCHED_WARN_ON(rq->cpu != smp_processor_id());
/*
* Ensure the thing is persistent until balance_push_set(.on = false);
@@ -8844,9 +8844,10 @@ static void balance_push(struct rq *rq)
rq->balance_callback = &balance_push_callback;
/*
- * Only active while going offline.
+ * Only active while going offline and when invoked on the outgoing
+ * CPU.
*/
- if (!cpu_dying(rq->cpu))
+ if (!cpu_dying(rq->cpu) || rq != this_rq())
return;
/*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 49716228efb4..17a653b67006 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[16];
+ unsigned int scaling;
if (cnt > 15)
cnt = 15;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = '\0';
- if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
+ if (kstrtouint(buf, 10, &scaling))
return -EINVAL;
+ if (scaling >= SCHED_TUNABLESCALING_END)
+ return -EINVAL;
+
+ sysctl_sched_tunable_scaling = scaling;
if (sched_update_scaling())
return -EINVAL;
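
The sched_scaling_write() fix NUL-terminates the copied buffer, parses into a local, range-checks it against SCHED_TUNABLESCALING_END, and only then stores to the sysctl, so a bad write can never expose an out-of-range value. A userspace sketch of the same parse-validate-commit order; the limit macro is illustrative:

#include <errno.h>
#include <stdlib.h>

#define SCALING_END 3U		/* one past the last valid setting (illustrative) */

static unsigned int tunable_scaling;

static int set_scaling(const char *buf)
{
	char *end;
	unsigned long val;

	errno = 0;
	val = strtoul(buf, &end, 10);
	if (errno || end == buf || val >= SCALING_END)
		return -EINVAL;			/* reject before touching the global */

	tunable_scaling = (unsigned int)val;	/* commit only a validated value */
	return 0;
}
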
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ff69f245b939..f6a05d9b5443 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4936,8 +4936,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- if (!cfs_rq->load.weight)
+ /* Nothing to run but something to decay (on_list)? Complete the branch */
+ if (!cfs_rq->load.weight) {
+ if (cfs_rq->on_list)
+ goto unthrottle_throttle;
return;
+ }
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 912b47aa99d8..d17b0a5ce6ac 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
cpuidle_use_deepest_state(latency_ns);
it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
it.timer.function = idle_inject_timer_fn;
hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_REL_PINNED_HARD);
while (!READ_ONCE(it.done))
do_idle();
diff --git a/kernel/signal.c b/kernel/signal.c
index 952741f6d0f9..487bf4f5dadf 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -426,22 +426,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
*/
rcu_read_lock();
ucounts = task_ucounts(t);
- sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
- switch (sigpending) {
- case 1:
- if (likely(get_ucounts(ucounts)))
- break;
- fallthrough;
- case LONG_MAX:
- /*
- * we need to decrease the ucount in the userns tree on any
- * failure to avoid counts leaking.
- */
- dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
- rcu_read_unlock();
- return NULL;
- }
+ sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock();
+ if (!sigpending)
+ return NULL;
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
@@ -450,8 +438,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
}
if (unlikely(q == NULL)) {
- if (dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1))
- put_ucounts(ucounts);
+ dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = sigqueue_flags;
@@ -464,8 +451,8 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
- if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) {
- put_ucounts(q->ucounts);
+ if (q->ucounts) {
+ dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING);
q->ucounts = NULL;
}
kmem_cache_free(sigqueue_cachep, q);
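
The signal.c hunks above replace the open-coded inc_rlimit_ucounts()/get_ucounts() sequence with the combined inc_rlimit_get_ucounts()/dec_rlimit_put_ucounts() helpers (introduced in kernel/ucount.c further down). A user-space sketch of the caller-side reserve/allocate/roll-back shape, with hypothetical names and a single flat counter instead of the ucounts hierarchy:

/* Sketch of the reserve -> allocate -> roll back pattern; not kernel code. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static atomic_long pending;		/* stands in for the SIGPENDING ucount */

static long reserve_pending(void)	/* cf. inc_rlimit_get_ucounts() */
{
	long new = atomic_fetch_add(&pending, 1) + 1;

	if (new < 0) {			/* counter overflow: undo and fail */
		atomic_fetch_sub(&pending, 1);
		return 0;
	}
	return new;
}

static void unreserve_pending(void)	/* cf. dec_rlimit_put_ucounts() */
{
	atomic_fetch_sub(&pending, 1);
}

static void *alloc_sigqueue(long limit)
{
	long n = reserve_pending();
	void *q = NULL;

	if (!n)
		return NULL;
	if (n <= limit)
		q = malloc(64);
	if (!q)				/* over the limit or allocation failed: roll back */
		unreserve_pending();
	return q;
}

int main(void)
{
	void *q = alloc_sigqueue(128);

	printf("allocated: %s\n", q ? "yes" : "no");
	free(q);
	return 0;
}
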
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index ee736861b18f..643d412ac623 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1404,7 +1404,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid,
}
}
- *newval += now;
+ if (*newval)
+ *newval += now;
}
/*
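
In the hunk above a zero *newval means "no timer"; unconditionally adding the current time would silently arm it, so the offset is applied only to non-zero values. A trivial user-space illustration of the zero-as-disarmed guard (hypothetical names):

/* Sketch: 0 is a "disarmed" sentinel and must not become an absolute expiry. */
#include <stdint.h>
#include <stdio.h>

static void set_expiry(uint64_t *newval, uint64_t now)
{
	if (*newval)			/* 0 stays 0: the timer remains disarmed */
		*newval += now;
}

int main(void)
{
	uint64_t disarm = 0, rel = 500;

	set_expiry(&disarm, 10000);
	set_expiry(&rel, 10000);
	printf("disarm=%llu rel=%llu\n",
	       (unsigned long long)disarm, (unsigned long long)rel);
	return 0;
}
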
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c221e4c3f625..fa91f398f28b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1605,6 +1605,14 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
+ if (bt->trace_state == Blktrace_running) {
+ bt->trace_state = Blktrace_stopped;
+ spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+ }
+
put_probe_ref();
synchronize_rcu();
blk_trace_free(bt);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7efbc8aaf7f6..feebf57c6458 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2208,7 +2208,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update)
}
/**
- * ftrace_update_record, set a record that now is tracing or not
+ * ftrace_update_record - set a record that now is tracing or not
* @rec: the record to update
* @enable: set to true if the record is tracing, false to force disable
*
@@ -2221,7 +2221,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable)
}
/**
- * ftrace_test_record, check if the record has been enabled or not
+ * ftrace_test_record - check if the record has been enabled or not
* @rec: the record to test
* @enable: set to true to check if enabled, false if it is disabled
*
@@ -2574,7 +2574,7 @@ struct ftrace_rec_iter {
};
/**
- * ftrace_rec_iter_start, start up iterating over traced functions
+ * ftrace_rec_iter_start - start up iterating over traced functions
*
* Returns an iterator handle that is used to iterate over all
* the records that represent address locations where functions
@@ -2605,7 +2605,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void)
}
/**
- * ftrace_rec_iter_next, get the next record to process.
+ * ftrace_rec_iter_next - get the next record to process.
* @iter: The handle to the iterator.
*
* Returns the next iterator after the given iterator @iter.
@@ -2630,7 +2630,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter)
}
/**
- * ftrace_rec_iter_record, get the record at the iterator location
+ * ftrace_rec_iter_record - get the record at the iterator location
* @iter: The current iterator location
*
* Returns the record that the current @iter is at.
@@ -2733,7 +2733,7 @@ static int __ftrace_modify_code(void *data)
}
/**
- * ftrace_run_stop_machine, go back to the stop machine method
+ * ftrace_run_stop_machine - go back to the stop machine method
* @command: The command to tell ftrace what to do
*
* If an arch needs to fall back to the stop machine method, the
@@ -2745,7 +2745,7 @@ void ftrace_run_stop_machine(int command)
}
/**
- * arch_ftrace_update_code, modify the code to trace or not trace
+ * arch_ftrace_update_code - modify the code to trace or not trace
* @command: The command that needs to be done
*
* Archs can override this function if it does not need to
@@ -6977,7 +6977,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
if (bit < 0)
return;
@@ -7052,7 +7052,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
{
int bit;
- bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START);
if (bit < 0)
return;
@@ -7525,7 +7525,9 @@ void ftrace_kill(void)
}
/**
- * Test if ftrace is dead or not.
+ * ftrace_is_dead - Test if ftrace is dead or not.
+ *
+ * Returns 1 if ftrace is "dead", zero otherwise.
*/
int ftrace_is_dead(void)
{
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7896d30d90f7..bc677cd64224 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1744,16 +1744,15 @@ void latency_fsnotify(struct trace_array *tr)
irq_work_queue(&tr->fsnotify_irqwork);
}
-/*
- * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \
- * defined(CONFIG_FSNOTIFY)
- */
-#else
+#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \
+ || defined(CONFIG_OSNOISE_TRACER)
#define trace_create_maxlat_file(tr, d_tracer) \
trace_create_file("tracing_max_latency", 0644, d_tracer, \
&tr->max_latency, &tracing_max_lat_fops)
+#else
+#define trace_create_maxlat_file(tr, d_tracer) do { } while (0)
#endif
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -9473,9 +9472,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
create_trace_options_dir(tr);
-#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)
trace_create_maxlat_file(tr, d_tracer);
-#endif
if (ftrace_create_function_files(tr, d_tracer))
MEM_FAIL(1, "Could not allocate function filter files");
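
The trace.c hunks above make trace_create_maxlat_file() unconditionally defined: the real file-creation macro in the latency-tracer configurations, a do { } while (0) stub otherwise, so the call site in init_tracer_tracefs() no longer needs its own #ifdef. A self-contained sketch of that preprocessor pattern (hypothetical macro and config names):

/* Sketch: always define the macro; make the fallback branch a no-op. */
#include <stdio.h>

/* #define HAVE_MAXLAT_FILE 1 */

#ifdef HAVE_MAXLAT_FILE
#define create_maxlat_file(name)	printf("creating %s\n", (name))
#else
/* no-op that still swallows a trailing semicolon cleanly */
#define create_maxlat_file(name)	do { } while (0)
#endif

int main(void)
{
	/* the call site stays unconditional: no #ifdef needed here */
	create_maxlat_file("tracing_max_latency");
	return 0;
}
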
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 388e65d05978..8d252f63cd78 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -219,13 +219,12 @@ static int __init
trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp,
char *end, const char *key)
{
- struct xbc_node *knode, *anode;
+ struct xbc_node *anode;
const char *p;
char sep;
- knode = xbc_node_find_child(hnode, key);
- if (knode) {
- anode = xbc_node_get_child(knode);
+ p = xbc_node_find_value(hnode, key, &anode);
+ if (p) {
if (!anode) {
pr_err("hist.%s requires value(s).\n", key);
return -EINVAL;
@@ -263,9 +262,9 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
append_printf(bufp, end, ":%s(%s)", handler, p);
/* Compose 'action' parameter */
- knode = xbc_node_find_child(hnode, "trace");
+ knode = xbc_node_find_subkey(hnode, "trace");
if (!knode)
- knode = xbc_node_find_child(hnode, "save");
+ knode = xbc_node_find_subkey(hnode, "save");
if (knode) {
anode = xbc_node_get_child(knode);
@@ -284,7 +283,7 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp,
sep = ',';
}
append_printf(bufp, end, ")");
- } else if (xbc_node_find_child(hnode, "snapshot")) {
+ } else if (xbc_node_find_subkey(hnode, "snapshot")) {
append_printf(bufp, end, ".snapshot()");
} else {
pr_err("hist.%s requires an action.\n",
@@ -315,7 +314,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp,
break;
}
- if (xbc_node_find_child(hnode, param))
+ if (xbc_node_find_subkey(hnode, param))
ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param);
return ret;
@@ -375,7 +374,7 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
if (p)
append_printf(&buf, end, ":name=%s", p);
- node = xbc_node_find_child(hnode, "var");
+ node = xbc_node_find_subkey(hnode, "var");
if (node) {
xbc_node_for_each_key_value(node, knode, p) {
/* Expression must not include spaces. */
@@ -386,21 +385,21 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size)
}
/* Histogram control attributes (mutual exclusive) */
- if (xbc_node_find_child(hnode, "pause"))
+ if (xbc_node_find_value(hnode, "pause", NULL))
append_printf(&buf, end, ":pause");
- else if (xbc_node_find_child(hnode, "continue"))
+ else if (xbc_node_find_value(hnode, "continue", NULL))
append_printf(&buf, end, ":continue");
- else if (xbc_node_find_child(hnode, "clear"))
+ else if (xbc_node_find_value(hnode, "clear", NULL))
append_printf(&buf, end, ":clear");
/* Histogram handler and actions */
- node = xbc_node_find_child(hnode, "onmax");
+ node = xbc_node_find_subkey(hnode, "onmax");
if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
return -EINVAL;
- node = xbc_node_find_child(hnode, "onchange");
+ node = xbc_node_find_subkey(hnode, "onchange");
if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0)
return -EINVAL;
- node = xbc_node_find_child(hnode, "onmatch");
+ node = xbc_node_find_subkey(hnode, "onmatch");
if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0)
return -EINVAL;
@@ -437,7 +436,7 @@ trace_boot_init_histograms(struct trace_event_file *file,
}
}
- if (xbc_node_find_child(hnode, "keys")) {
+ if (xbc_node_find_subkey(hnode, "keys")) {
if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) {
tmp = kstrdup(buf, GFP_KERNEL);
if (trigger_process_regex(file, buf) < 0)
@@ -496,7 +495,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode,
else if (trigger_process_regex(file, buf) < 0)
pr_err("Failed to apply an action: %s\n", p);
}
- anode = xbc_node_find_child(enode, "hist");
+ anode = xbc_node_find_subkey(enode, "hist");
if (anode)
trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf));
} else if (xbc_node_find_value(enode, "actions", NULL))
@@ -518,7 +517,7 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node)
bool enable, enable_all = false;
const char *data;
- node = xbc_node_find_child(node, "event");
+ node = xbc_node_find_subkey(node, "event");
if (!node)
return;
/* per-event key starts with "event.GROUP.EVENT" */
@@ -621,7 +620,7 @@ trace_boot_init_instances(struct xbc_node *node)
struct trace_array *tr;
const char *p;
- node = xbc_node_find_child(node, "instance");
+ node = xbc_node_find_subkey(node, "instance");
if (!node)
return;
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index 3044b762cbd7..928867f527e7 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -119,10 +119,58 @@ static bool eprobe_dyn_event_match(const char *system, const char *event,
int argc, const char **argv, struct dyn_event *ev)
{
struct trace_eprobe *ep = to_trace_eprobe(ev);
+ const char *slash;
- return strcmp(trace_probe_name(&ep->tp), event) == 0 &&
- (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) &&
- trace_probe_match_command_args(&ep->tp, argc, argv);
+ /*
+ * We match the following:
+ * event only - match all eprobes with event name
+ * system and event only - match all system/event probes
+ *
+ * The below has the above satisfied with more arguments:
+ *
+ * attached system/event - If the arg has the system and event
+ * the probe is attached to, match
+ * probes with the attachment.
+ *
+ * If any more args are given, then it requires a full match.
+ */
+
+ /*
+ * If system exists, but this probe is not part of that system
+ * do not match.
+ */
+ if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0)
+ return false;
+
+ /* Must match the event name */
+ if (strcmp(trace_probe_name(&ep->tp), event) != 0)
+ return false;
+
+ /* No arguments match all */
+ if (argc < 1)
+ return true;
+
+ /* First argument is the system/event the probe is attached to */
+
+ slash = strchr(argv[0], '/');
+ if (!slash)
+ slash = strchr(argv[0], '.');
+ if (!slash)
+ return false;
+
+ if (strncmp(ep->event_system, argv[0], slash - argv[0]))
+ return false;
+ if (strcmp(ep->event_name, slash + 1))
+ return false;
+
+ argc--;
+ argv++;
+
+ /* If there are no other args, then match */
+ if (argc < 1)
+ return true;
+
+ return trace_probe_match_command_args(&ep->tp, argc, argv);
}
static struct dyn_event_operations eprobe_dyn_event_ops = {
@@ -632,6 +680,13 @@ static int disable_eprobe(struct trace_eprobe *ep,
trace_event_trigger_enable_disable(file, 0);
update_cond_flag(file);
+
+ /* Make sure nothing is using the edata or trigger */
+ tracepoint_synchronize_unregister();
+
+ kfree(edata);
+ kfree(trigger);
+
return 0;
}
@@ -849,8 +904,8 @@ static int __trace_eprobe_create(int argc, const char *argv[])
if (IS_ERR(ep)) {
ret = PTR_ERR(ep);
- /* This must return -ENOMEM, else there is a bug */
- WARN_ON_ONCE(ret != -ENOMEM);
+ /* This must return -ENOMEM or missing event, else there is a bug */
+ WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV);
ep = NULL;
goto error;
}
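
The new eprobe_dyn_event_match() above lets the first argument name the attached event as SYSTEM/EVENT or SYSTEM.EVENT before any full argument comparison is attempted. A simplified user-space sketch of just that attachment check (it is stricter about system-name prefixes than the kernel code and is not the kernel implementation):

/* Sketch: parse "SYSTEM/EVENT" or "SYSTEM.EVENT" and compare both halves. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool match_attachment(const char *arg,
			     const char *event_system,
			     const char *event_name)
{
	const char *slash;

	slash = strchr(arg, '/');
	if (!slash)
		slash = strchr(arg, '.');
	if (!slash)
		return false;

	/* the system part must match up to the separator ... */
	if (strncmp(event_system, arg, slash - arg) != 0)
		return false;
	if (event_system[slash - arg] != '\0')	/* and not merely be a prefix */
		return false;

	/* ... and the remainder must match the event name exactly */
	return strcmp(event_name, slash + 1) == 0;
}

int main(void)
{
	printf("%d\n", match_attachment("sched/sched_switch", "sched", "sched_switch"));
	printf("%d\n", match_attachment("sched.sched_switch", "sched", "sched_switch"));
	printf("%d\n", match_attachment("schedx/sched_switch", "sched", "sched_switch"));
	return 0;
}
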
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index a6061a69aa84..f01e442716e2 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -2506,7 +2506,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data,
* events. However, for convenience, users are allowed to directly
* specify an event field in an action, which will be automatically
* converted into a variable on their behalf.
-
+ *
* If a user specifies a field on an event that isn't the event the
* histogram currently being defined (the target event histogram), the
* only way that can be accomplished is if a new hist trigger is
diff --git a/kernel/ucount.c b/kernel/ucount.c
index bb51849e6375..eb03f3c68375 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -284,6 +284,55 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
return (new == 0);
}
+static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts,
+ struct ucounts *last, enum ucount_type type)
+{
+ struct ucounts *iter, *next;
+ for (iter = ucounts; iter != last; iter = next) {
+ long dec = atomic_long_add_return(-1, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+ next = iter->ns->ucounts;
+ if (dec == 0)
+ put_ucounts(iter);
+ }
+}
+
+void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type)
+{
+ do_dec_rlimit_put_ucounts(ucounts, NULL, type);
+}
+
+long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type)
+{
+ /* Caller must hold a reference to ucounts */
+ struct ucounts *iter;
+ long dec, ret = 0;
+
+ for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+ long max = READ_ONCE(iter->ns->ucount_max[type]);
+ long new = atomic_long_add_return(1, &iter->ucount[type]);
+ if (new < 0 || new > max)
+ goto unwind;
+ if (iter == ucounts)
+ ret = new;
+ /*
+ * Grab an extra ucount reference for the caller when
+ * the rlimit count was previously 0.
+ */
+ if (new != 1)
+ continue;
+ if (!get_ucounts(iter))
+ goto dec_unwind;
+ }
+ return ret;
+dec_unwind:
+ dec = atomic_long_add_return(-1, &iter->ucount[type]);
+ WARN_ON_ONCE(dec < 0);
+unwind:
+ do_dec_rlimit_put_ucounts(ucounts, iter, type);
+ return 0;
+}
+
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
{
struct ucounts *iter;
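
inc_rlimit_get_ucounts() above charges every level of the user-namespace chain against that level's limit and, on failure, unwinds exactly the levels it already charged; the reference taken on a 0 -> 1 transition is what dec_rlimit_put_ucounts() later drops. A user-space sketch of the charge-then-unwind walk with hypothetical types (the reference counting is omitted):

/* Sketch: increment up a parent chain, roll back everything on failure. */
#include <stdio.h>

struct node {
	long count;
	long max;
	struct node *parent;
};

static void unwind(struct node *start, struct node *last)
{
	for (struct node *n = start; n != last; n = n->parent)
		n->count--;
}

static long try_charge(struct node *leaf)
{
	long ret = 0;

	for (struct node *n = leaf; n; n = n->parent) {
		if (++n->count > n->max) {	/* over this level's limit */
			n->count--;		/* undo this level ... */
			unwind(leaf, n);	/* ... and everything below it */
			return 0;
		}
		if (n == leaf)
			ret = n->count;		/* report the leaf's new value */
	}
	return ret;
}

int main(void)
{
	struct node root = { .count = 0, .max = 2, .parent = NULL };
	struct node child = { .count = 0, .max = 10, .parent = &root };

	printf("%ld\n", try_charge(&child));	/* 1 */
	printf("%ld\n", try_charge(&child));	/* 2 */
	printf("%ld\n", try_charge(&child));	/* 0: root's max of 2 is hit */
	return 0;
}
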
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 33a6b4a2443d..1b3eb1e9531f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4830,8 +4830,16 @@ void show_workqueue_state(void)
for_each_pwq(pwq, wq) {
raw_spin_lock_irqsave(&pwq->pool->lock, flags);
- if (pwq->nr_active || !list_empty(&pwq->inactive_works))
+ if (pwq->nr_active || !list_empty(&pwq->inactive_works)) {
+ /*
+ * Defer printing to avoid deadlocks in console
+ * drivers that queue work while holding locks
+ * also taken in their write paths.
+ */
+ printk_deferred_enter();
show_pwq(pwq);
+ printk_deferred_exit();
+ }
raw_spin_unlock_irqrestore(&pwq->pool->lock, flags);
/*
* We could be printing a lot from atomic context, e.g.
@@ -4849,7 +4857,12 @@ void show_workqueue_state(void)
raw_spin_lock_irqsave(&pool->lock, flags);
if (pool->nr_workers == pool->nr_idle)
goto next_pool;
-
+ /*
+ * Defer printing to avoid deadlocks in console drivers that
+ * queue work while holding locks also taken in their write
+ * paths.
+ */
+ printk_deferred_enter();
pr_info("pool %d:", pool->id);
pr_cont_pool_info(pool);
pr_cont(" hung=%us workers=%d",
@@ -4864,6 +4877,7 @@ void show_workqueue_state(void)
first = false;
}
pr_cont("\n");
+ printk_deferred_exit();
next_pool:
raw_spin_unlock_irqrestore(&pool->lock, flags);
/*