diff options
Diffstat (limited to 'kernel')
40 files changed, 561 insertions, 326 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dd73a64f921..b1cb1dbf7417 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -657,7 +657,7 @@ static int audit_filter_rules(struct task_struct *tsk, result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); break; case AUDIT_SADDR_FAM: - if (ctx->sockaddr) + if (ctx && ctx->sockaddr) result = audit_comparator(ctx->sockaddr->ss_family, f->op, f->val); break; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index cebd4fb06d19..447def540544 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1072,6 +1072,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) INIT_WORK(&aux->work, prog_array_map_clear_deferred); INIT_LIST_HEAD(&aux->poke_progs); mutex_init(&aux->poke_mutex); + spin_lock_init(&aux->owner.lock); map = array_map_alloc(attr); if (IS_ERR(map)) { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index d6731c32864e..9abcc33f02cf 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -368,6 +368,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, const struct btf_type *mtype, *ptype; struct bpf_prog *prog; u32 moff; + u32 flags; moff = btf_member_bit_offset(t, member) / 8; ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL); @@ -431,10 +432,12 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; + flags = st_ops->func_models[i].ret_size > 0 ? + BPF_TRAMP_F_RET_FENTRY_RET : 0; err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, - &st_ops->func_models[i], 0, - tprogs, NULL); + &st_ops->func_models[i], + flags, tprogs, NULL); if (err < 0) goto reset_unlock; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9f4636d021b1..6e3ae90ad107 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -524,6 +524,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; +long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) @@ -817,7 +818,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void) static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ - bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); + bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2, PAGE_SIZE), LONG_MAX); return 0; } @@ -827,7 +829,7 @@ int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { atomic_long_sub(pages, &bpf_jit_current); return -EPERM; } @@ -1821,20 +1823,26 @@ static unsigned int __bpf_prog_ret0_warn(const void *ctx, bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + bool ret; + if (fp->kprobe_override) return false; - if (!array->aux->type) { + spin_lock(&array->aux->owner.lock); + + if (!array->aux->owner.type) { /* There's no owner yet where we could check for * compatibility. */ - array->aux->type = fp->type; - array->aux->jited = fp->jited; - return true; + array->aux->owner.type = fp->type; + array->aux->owner.jited = fp->jited; + ret = true; + } else { + ret = array->aux->owner.type == fp->type && + array->aux->owner.jited == fp->jited; } - - return array->aux->type == fp->type && - array->aux->jited == fp->jited; + spin_unlock(&array->aux->owner.lock); + return ret; } static int bpf_check_tail_call(const struct bpf_prog *fp) diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ca3cd9aaa6ce..7b4afb7d96db 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only +// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e546b18d27da..a4b040793f44 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ +/* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook */ diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index e8eefdf8cf3e..6e75bbee39f0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -63,7 +63,8 @@ static inline int stack_map_data_size(struct bpf_map *map) static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) { - u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + u64 elem_size = sizeof(struct stack_map_bucket) + + (u64)smap->map.value_size; int err; smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, @@ -179,7 +180,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -204,9 +205,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, } if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 4e50c0bfdb7d..1cad6979a0d0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -543,8 +543,10 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { array = container_of(map, struct bpf_array, map); - type = array->aux->type; - jited = array->aux->jited; + spin_lock(&array->aux->owner.lock); + type = array->aux->owner.type; + jited = array->aux->owner.jited; + spin_unlock(&array->aux->owner.lock); } seq_printf(m, @@ -1337,12 +1339,11 @@ int generic_map_update_batch(struct bpf_map *map, void __user *values = u64_to_user_ptr(attr->batch.values); void __user *keys = u64_to_user_ptr(attr->batch.keys); u32 value_size, cp, max_count; - int ufd = attr->map_fd; + int ufd = attr->batch.map_fd; void *key, *value; struct fd f; int err = 0; - f = fdget(ufd); if (attr->batch.elem_flags & ~BPF_F_LOCK) return -EINVAL; @@ -1367,6 +1368,7 @@ int generic_map_update_batch(struct bpf_map *map, return -ENOMEM; } + f = fdget(ufd); /* bpf_map_do_batch() guarantees ufd is valid */ for (cp = 0; cp < max_count; cp++) { err = -EFAULT; if (copy_from_user(key, keys + cp * map->key_size, @@ -1386,6 +1388,7 @@ int generic_map_update_batch(struct bpf_map *map, kvfree(value); kvfree(key); + fdput(f); return err; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 047ac4b4703b..de006552be8a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9912,6 +9912,8 @@ static int check_btf_line(struct bpf_verifier_env *env, nr_linfo = attr->line_info_cnt; if (!nr_linfo) return 0; + if (nr_linfo > INT_MAX / sizeof(struct bpf_line_info)) + return -EINVAL; rec_size = attr->line_info_rec_size; if (rec_size < MIN_BPF_LINEINFO_SIZE || @@ -13317,7 +13319,7 @@ BTF_SET_START(btf_non_sleepable_error_inject) /* Three functions below can be called from sleepable and non-sleepable context. * Assume non-sleepable from bpf safety point of view. */ -BTF_ID(func, __add_to_page_cache_locked) +BTF_ID(func, __filemap_add_folio) BTF_ID(func, should_fail_alloc_page) BTF_ID(func, should_failslab) BTF_SET_END(btf_non_sleepable_error_inject) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 881ce1470beb..ea08f01d0111 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2187,8 +2187,10 @@ static void cgroup_kill_sb(struct super_block *sb) * And don't kill the default root. */ if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root && - !percpu_ref_is_dying(&root->cgrp.self.refcnt)) + !percpu_ref_is_dying(&root->cgrp.self.refcnt)) { + cgroup_bpf_offline(&root->cgrp); percpu_ref_kill(&root->cgrp.self.refcnt); + } cgroup_put(&root->cgrp); kernfs_kill_sb(sb); } @@ -6572,74 +6574,51 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) */ #ifdef CONFIG_SOCK_CGROUP_DATA -#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) - -DEFINE_SPINLOCK(cgroup_sk_update_lock); -static bool cgroup_sk_alloc_disabled __read_mostly; - -void cgroup_sk_alloc_disable(void) -{ - if (cgroup_sk_alloc_disabled) - return; - pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); - cgroup_sk_alloc_disabled = true; -} - -#else - -#define cgroup_sk_alloc_disabled false - -#endif - void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; - return; - } - - /* Don't associate the sock with unrelated interrupted task's cgroup. */ - if (in_interrupt()) - return; + struct cgroup *cgroup; rcu_read_lock(); + /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) { + cgroup = &cgrp_dfl_root.cgrp; + cgroup_get(cgroup); + goto out; + } while (true) { struct css_set *cset; cset = task_css_set(current); if (likely(cgroup_tryget(cset->dfl_cgrp))) { - skcd->val = (unsigned long)cset->dfl_cgrp; - cgroup_bpf_get(cset->dfl_cgrp); + cgroup = cset->dfl_cgrp; break; } cpu_relax(); } - +out: + skcd->cgroup = cgroup; + cgroup_bpf_get(cgroup); rcu_read_unlock(); } void cgroup_sk_clone(struct sock_cgroup_data *skcd) { - if (skcd->val) { - if (skcd->no_refcnt) - return; - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); - cgroup_bpf_get(sock_cgroup_ptr(skcd)); - } + struct cgroup *cgrp = sock_cgroup_ptr(skcd); + + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(cgrp); + cgroup_bpf_get(cgrp); } void cgroup_sk_free(struct sock_cgroup_data *skcd) { struct cgroup *cgrp = sock_cgroup_ptr(skcd); - if (skcd->no_refcnt) - return; cgroup_bpf_put(cgrp); cgroup_put(cgrp); } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index df1ccf4558f8..2a9695ccb65f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -311,17 +311,19 @@ static struct cpuset top_cpuset = { if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* - * There are two global locks guarding cpuset structures - cpuset_mutex and + * There are two global locks guarding cpuset structures - cpuset_rwsem and * callback_lock. We also require taking task_lock() when dereferencing a * task's cpuset pointer. See "The task_lock() exception", at the end of this - * comment. + * comment. The cpuset code uses only cpuset_rwsem write lock. Other + * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to + * prevent change to cpuset structures. * * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it + * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it * is the only task able to also acquire callback_lock and be able to * modify cpusets. It can perform various checks on the cpuset structure * first, knowing nothing will change. It can also allocate memory while - * just holding cpuset_mutex. While it is performing these checks, various + * just holding cpuset_rwsem. While it is performing these checks, various * callback routines can briefly acquire callback_lock to query cpusets. * Once it is ready to make the changes, it takes callback_lock, blocking * everyone else. @@ -393,7 +395,7 @@ static inline bool is_in_v2_mode(void) * One way or another, we guarantee to return some non-empty subset * of cpu_online_mask. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_cpus(struct task_struct *tsk, struct cpumask *pmask) @@ -435,7 +437,7 @@ out_unlock: * One way or another, we guarantee to return some non-empty subset * of node_states[N_MEMORY]. * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { @@ -447,7 +449,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) /* * update task's spread flag if cpuset's page/slab spread flag is set * - * Call with callback_lock or cpuset_mutex held. + * Call with callback_lock or cpuset_rwsem held. */ static void cpuset_update_task_spread_flag(struct cpuset *cs, struct task_struct *tsk) @@ -468,7 +470,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, * * One cpuset is a subset of another if all its allowed CPUs and * Memory Nodes are a subset of the other, and its exclusive flags - * are only set if the other's are set. Call holding cpuset_mutex. + * are only set if the other's are set. Call holding cpuset_rwsem. */ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) @@ -577,7 +579,7 @@ static inline void free_cpuset(struct cpuset *cs) * If we replaced the flag and mask values of the current cpuset * (cur) with those values in the trial cpuset (trial), would * our various subset and exclusive rules still be valid? Presumes - * cpuset_mutex held. + * cpuset_rwsem held. * * 'cur' is the address of an actual, in-use cpuset. Operations * such as list traversal that depend on the actual address of the @@ -700,7 +702,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_unlock(); } -/* Must be called with cpuset_mutex held. */ +/* Must be called with cpuset_rwsem held. */ static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ @@ -726,7 +728,7 @@ static inline int nr_cpusets(void) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Must be called with cpuset_mutex held. + * Must be called with cpuset_rwsem held. * * The three key local variables below are: * cp - cpuset pointer, used (together with pos_css) to perform a @@ -1005,7 +1007,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], * 'cpus' is removed, then call this routine to rebuild the * scheduler's dynamic sched domains. * - * Call with cpuset_mutex held. Takes cpus_read_lock(). + * Call with cpuset_rwsem held. Takes cpus_read_lock(). */ static void rebuild_sched_domains_locked(void) { @@ -1078,7 +1080,7 @@ void rebuild_sched_domains(void) * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * * Iterate through each task of @cs updating its cpus_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_cpumask(struct cpuset *cs) @@ -1347,7 +1349,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) { @@ -1704,12 +1706,12 @@ static void *cpuset_being_rebound; * @cs: the cpuset in which each task's mems_allowed mask needs to be changed * * Iterate through each task of @cs updating its mems_allowed to the - * effective cpuset's. As this function is called with cpuset_mutex held, + * effective cpuset's. As this function is called with cpuset_rwsem held, * cpuset membership stays stable. */ static void update_tasks_nodemask(struct cpuset *cs) { - static nodemask_t newmems; /* protected by cpuset_mutex */ + static nodemask_t newmems; /* protected by cpuset_rwsem */ struct css_task_iter it; struct task_struct *task; @@ -1722,7 +1724,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * take while holding tasklist_lock. Forks can happen - the * mpol_dup() cpuset_being_rebound check will catch such forks, * and rebind their vma mempolicies too. Because we still hold - * the global cpuset_mutex, we know that no other rebind effort + * the global cpuset_rwsem, we know that no other rebind effort * will be contending for the global variable cpuset_being_rebound. * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. @@ -1768,7 +1770,7 @@ static void update_tasks_nodemask(struct cpuset *cs) * * On legacy hierarchy, effective_mems will be the same with mems_allowed. * - * Called with cpuset_mutex held + * Called with cpuset_rwsem held */ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) { @@ -1821,7 +1823,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * mempolicies and if the cpuset is marked 'memory_migrate', * migrate the tasks pages to the new memory. * - * Call with cpuset_mutex held. May take callback_lock during call. + * Call with cpuset_rwsem held. May take callback_lock during call. * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * lock each such tasks mm->mmap_lock, scan its vma's and rebind * their mempolicies to the cpusets new mems_allowed. @@ -1911,7 +1913,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) * @cs: the cpuset in which each task's spread flags needs to be changed * * Iterate through each task of @cs updating its spread flags. As this - * function is called with cpuset_mutex held, cpuset membership stays + * function is called with cpuset_rwsem held, cpuset membership stays * stable. */ static void update_tasks_flags(struct cpuset *cs) @@ -1931,7 +1933,7 @@ static void update_tasks_flags(struct cpuset *cs) * cs: the cpuset to update * turning_on: whether the flag is being set or cleared * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, @@ -1980,7 +1982,7 @@ out: * cs: the cpuset to update * new_prs: new partition root state * - * Call with cpuset_mutex held. + * Call with cpuset_rwsem held. */ static int update_prstate(struct cpuset *cs, int new_prs) { @@ -2167,7 +2169,7 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; -/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ +/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ static int cpuset_can_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; @@ -2219,7 +2221,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset) } /* - * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach() + * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach() * but we can't allocate it dynamically there. Define it global and * allocate from cpuset_init(). */ @@ -2227,7 +2229,7 @@ static cpumask_var_t cpus_attach; static void cpuset_attach(struct cgroup_taskset *tset) { - /* static buf protected by cpuset_mutex */ + /* static buf protected by cpuset_rwsem */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; @@ -2417,7 +2419,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after - * grabbing cpuset_mutex anyway. This only happens on the legacy + * grabbing cpuset_rwsem anyway. This only happens on the legacy * hierarchies. */ css_get(&cs->css); @@ -3672,7 +3674,7 @@ void __cpuset_memory_pressure_bump(void) * - Used for /proc/<pid>/cpuset. * - No need to task_lock(tsk) on this tsk->cpuset reference, as it * doesn't really matter if tsk->cpuset changes after we read it, - * and we take cpuset_mutex, keeping cpuset_attach() from changing it + * and we take cpuset_rwsem, keeping cpuset_attach() from changing it * anyway. */ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, diff --git a/kernel/cred.c b/kernel/cred.c index f784e08c2fbd..1ae0b4948a5a 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -225,8 +225,6 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif - new->ucounts = get_ucounts(&init_ucounts); - if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -501,7 +499,7 @@ int commit_creds(struct cred *new) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); - if (new->user != old->user) + if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(old, -2); @@ -669,7 +667,7 @@ int set_cred_ucounts(struct cred *new) { struct task_struct *task = current; const struct cred *old = task->real_cred; - struct ucounts *old_ucounts = new->ucounts; + struct ucounts *new_ucounts, *old_ucounts = new->ucounts; if (new->user == old->user && new->user_ns == old->user_ns) return 0; @@ -681,9 +679,10 @@ int set_cred_ucounts(struct cred *new) if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) return 0; - if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid))) + if (!(new_ucounts = alloc_ucounts(new->user_ns, new->euid))) return -EAGAIN; + new->ucounts = new_ucounts; if (old_ucounts) put_ucounts(old_ucounts); diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index 6c90c69e5311..7a14ca29c377 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c @@ -552,7 +552,7 @@ static void active_cacheline_remove(struct dma_debug_entry *entry) * Wrapper function for adding an entry to the hash. * This function takes care of locking itself. */ -static void add_dma_entry(struct dma_debug_entry *entry) +static void add_dma_entry(struct dma_debug_entry *entry, unsigned long attrs) { struct hash_bucket *bucket; unsigned long flags; @@ -566,8 +566,9 @@ static void add_dma_entry(struct dma_debug_entry *entry) if (rc == -ENOMEM) { pr_err("cacheline tracking ENOMEM, dma-debug disabled\n"); global_disable = true; - } else if (rc == -EEXIST) { - pr_err("cacheline tracking EEXIST, overlapping mappings aren't supported\n"); + } else if (rc == -EEXIST && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) { + err_printk(entry->dev, entry, + "cacheline tracking EEXIST, overlapping mappings aren't supported\n"); } } @@ -1190,7 +1191,8 @@ void debug_dma_map_single(struct device *dev, const void *addr, EXPORT_SYMBOL(debug_dma_map_single); void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, - size_t size, int direction, dma_addr_t dma_addr) + size_t size, int direction, dma_addr_t dma_addr, + unsigned long attrs) { struct dma_debug_entry *entry; @@ -1221,7 +1223,7 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, check_for_illegal_area(dev, addr, size); } - add_dma_entry(entry); + add_dma_entry(entry, attrs); } void debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr) @@ -1279,7 +1281,8 @@ void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, } void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction) + int nents, int mapped_ents, int direction, + unsigned long attrs) { struct dma_debug_entry *entry; struct scatterlist *s; @@ -1288,6 +1291,12 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, if (unlikely(dma_debug_disabled())) return; + for_each_sg(sg, s, nents, i) { + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) + check_for_illegal_area(dev, sg_virt(s), s->length); + } + for_each_sg(sg, s, mapped_ents, i) { entry = dma_entry_alloc(); if (!entry) @@ -1303,15 +1312,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; - check_for_stack(dev, sg_page(s), s->offset); - - if (!PageHighMem(sg_page(s))) { - check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); - } - check_sg_segment(dev, s); - add_dma_entry(entry); + add_dma_entry(entry, attrs); } } @@ -1367,7 +1370,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, } void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt) + dma_addr_t dma_addr, void *virt, + unsigned long attrs) { struct dma_debug_entry *entry; @@ -1397,7 +1401,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, else entry->pfn = page_to_pfn(virt_to_page(virt)); - add_dma_entry(entry); + add_dma_entry(entry, attrs); } void debug_dma_free_coherent(struct device *dev, size_t size, @@ -1428,7 +1432,8 @@ void debug_dma_free_coherent(struct device *dev, size_t size, } void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, - int direction, dma_addr_t dma_addr) + int direction, dma_addr_t dma_addr, + unsigned long attrs) { struct dma_debug_entry *entry; @@ -1448,7 +1453,7 @@ void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, entry->direction = direction; entry->map_err_type = MAP_ERR_NOT_CHECKED; - add_dma_entry(entry); + add_dma_entry(entry, attrs); } void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, diff --git a/kernel/dma/debug.h b/kernel/dma/debug.h index 83643b3010b2..f525197d3cae 100644 --- a/kernel/dma/debug.h +++ b/kernel/dma/debug.h @@ -11,26 +11,30 @@ #ifdef CONFIG_DMA_API_DEBUG extern void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, - int direction, dma_addr_t dma_addr); + int direction, dma_addr_t dma_addr, + unsigned long attrs); extern void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, size_t size, int direction); extern void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction); + int nents, int mapped_ents, int direction, + unsigned long attrs); extern void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir); extern void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt); + dma_addr_t dma_addr, void *virt, + unsigned long attrs); extern void debug_dma_free_coherent(struct device *dev, size_t size, void *virt, dma_addr_t addr); extern void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, int direction, - dma_addr_t dma_addr); + dma_addr_t dma_addr, + unsigned long attrs); extern void debug_dma_unmap_resource(struct device *dev, dma_addr_t dma_addr, size_t size, int direction); @@ -53,7 +57,8 @@ extern void debug_dma_sync_sg_for_device(struct device *dev, #else /* CONFIG_DMA_API_DEBUG */ static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, - int direction, dma_addr_t dma_addr) + int direction, dma_addr_t dma_addr, + unsigned long attrs) { } @@ -63,7 +68,8 @@ static inline void debug_dma_unmap_page(struct device *dev, dma_addr_t addr, } static inline void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, - int nents, int mapped_ents, int direction) + int nents, int mapped_ents, int direction, + unsigned long attrs) { } @@ -74,7 +80,8 @@ static inline void debug_dma_unmap_sg(struct device *dev, } static inline void debug_dma_alloc_coherent(struct device *dev, size_t size, - dma_addr_t dma_addr, void *virt) + dma_addr_t dma_addr, void *virt, + unsigned long attrs) { } @@ -85,7 +92,8 @@ static inline void debug_dma_free_coherent(struct device *dev, size_t size, static inline void debug_dma_map_resource(struct device *dev, phys_addr_t addr, size_t size, int direction, - dma_addr_t dma_addr) + dma_addr_t dma_addr, + unsigned long attrs) { } diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 7ee5284bff58..8349a9f2c345 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -156,7 +156,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); - debug_dma_map_page(dev, page, offset, size, dir, addr); + debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; } @@ -195,7 +195,7 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, ents = ops->map_sg(dev, sg, nents, dir, attrs); if (ents > 0) - debug_dma_map_sg(dev, sg, nents, ents, dir); + debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && ents != -EIO)) return -EIO; @@ -206,7 +206,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, /** * dma_map_sg_attrs - Map the given buffer for DMA * @dev: The device for which to perform the DMA operation - * @sg: The sg_table object describing the buffer + * @sg: The sg_table object describing the buffer + * @nents: Number of entries to map * @dir: DMA direction * @attrs: Optional DMA attributes for the map operation * @@ -248,12 +249,12 @@ EXPORT_SYMBOL(dma_map_sg_attrs); * Returns 0 on success or a negative error code on error. The following * error codes are supported with the given meaning: * - * -EINVAL - An invalid argument, unaligned access or other error - * in usage. Will not succeed if retried. - * -ENOMEM - Insufficient resources (like memory or IOVA space) to - * complete the mapping. Should succeed if retried later. - * -EIO - Legacy error code with an unknown meaning. eg. this is - * returned if a lower level call returned DMA_MAPPING_ERROR. + * -EINVAL An invalid argument, unaligned access or other error + * in usage. Will not succeed if retried. + * -ENOMEM Insufficient resources (like memory or IOVA space) to + * complete the mapping. Should succeed if retried later. + * -EIO Legacy error code with an unknown meaning. eg. this is + * returned if a lower level call returned DMA_MAPPING_ERROR. */ int dma_map_sgtable(struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) @@ -304,7 +305,7 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, else if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); - debug_dma_map_resource(dev, phys_addr, size, dir, addr); + debug_dma_map_resource(dev, phys_addr, size, dir, addr, attrs); return addr; } EXPORT_SYMBOL(dma_map_resource); @@ -509,7 +510,7 @@ void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, else return NULL; - debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); + debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr, attrs); return cpu_addr; } EXPORT_SYMBOL(dma_alloc_attrs); @@ -565,7 +566,7 @@ struct page *dma_alloc_pages(struct device *dev, size_t size, struct page *page = __dma_alloc_pages(dev, size, dma_handle, dir, gfp); if (page) - debug_dma_map_page(dev, page, 0, size, dir, *dma_handle); + debug_dma_map_page(dev, page, 0, size, dir, *dma_handle, 0); return page; } EXPORT_SYMBOL_GPL(dma_alloc_pages); @@ -643,7 +644,7 @@ struct sg_table *dma_alloc_noncontiguous(struct device *dev, size_t size, if (sgt) { sgt->nents = 1; - debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir); + debug_dma_map_sg(dev, sgt->sgl, sgt->orig_nents, 1, dir, attrs); } return sgt; } diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bf16395b9e13..d5a61d565ad5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -171,10 +171,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) handle_signal_work(regs, ti_work); - if (ti_work & _TIF_NOTIFY_RESUME) { + if (ti_work & _TIF_NOTIFY_RESUME) tracehook_notify_resume(regs); - rseq_handle_notify_resume(NULL, regs); - } /* Architecture specific TIF work */ arch_exit_to_user_mode_work(regs, ti_work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 744e8726c5b2..f23ca260307f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3707,6 +3707,29 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, return 0; } +static inline bool event_update_userpage(struct perf_event *event) +{ + if (likely(!atomic_read(&event->mmap_count))) + return false; + + perf_event_update_time(event); + perf_set_shadow_time(event, event->ctx); + perf_event_update_userpage(event); + + return true; +} + +static inline void group_update_userpage(struct perf_event *group_event) +{ + struct perf_event *event; + + if (!event_update_userpage(group_event)) + return; + + for_each_sibling_event(event, group_event) + event_update_userpage(event); +} + static int merge_sched_in(struct perf_event *event, void *data) { struct perf_event_context *ctx = event->ctx; @@ -3725,14 +3748,15 @@ static int merge_sched_in(struct perf_event *event, void *data) } if (event->state == PERF_EVENT_STATE_INACTIVE) { + *can_add_hw = 0; if (event->attr.pinned) { perf_cgroup_event_disable(event, ctx); perf_event_set_state(event, PERF_EVENT_STATE_ERROR); + } else { + ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); + group_update_userpage(event); } - - *can_add_hw = 0; - ctx->rotate_necessary = 1; - perf_mux_hrtimer_restart(cpuctx); } return 0; @@ -6324,6 +6348,8 @@ accounting: ring_buffer_attach(event, rb); + perf_event_update_time(event); + perf_set_shadow_time(event, event->ctx); perf_event_init_userpage(event); perf_event_update_userpage(event); } else { @@ -10193,7 +10219,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event) return; if (ifh->nr_file_filters) { - mm = get_task_mm(event->ctx->task); + mm = get_task_mm(task); if (!mm) goto restart; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index af24dc3febbe..6357c3580d07 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL); + err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, + GFP_KERNEL); if (err) return err; } diff --git a/kernel/futex.c b/kernel/futex.c index e7b4c6121da4..c15ad276fd15 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1263,6 +1263,36 @@ static int handle_exit_race(u32 __user *uaddr, u32 uval, return -ESRCH; } +static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key, + struct futex_pi_state **ps) +{ + /* + * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. + */ + struct futex_pi_state *pi_state = alloc_pi_state(); + + /* + * Initialize the pi_mutex in locked state and make @p + * the owner of it: + */ + rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); + + /* Store the key for possible exit cleanups: */ + pi_state->key = *key; + + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &p->pi_state_list); + /* + * Assignment without holding pi_state->pi_mutex.wait_lock is safe + * because there is no concurrency as the object is not published yet. + */ + pi_state->owner = p; + + *ps = pi_state; +} /* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. @@ -1272,7 +1302,6 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, struct task_struct **exiting) { pid_t pid = uval & FUTEX_TID_MASK; - struct futex_pi_state *pi_state; struct task_struct *p; /* @@ -1324,36 +1353,11 @@ static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, return ret; } - /* - * No existing pi state. First waiter. [2] - * - * This creates pi_state, we have hb->lock held, this means nothing can - * observe this state, wait_lock is irrelevant. - */ - pi_state = alloc_pi_state(); - - /* - * Initialize the pi_mutex in locked state and make @p - * the owner of it: - */ - rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); - - /* Store the key for possible exit cleanups: */ - pi_state->key = *key; - - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &p->pi_state_list); - /* - * Assignment without holding pi_state->pi_mutex.wait_lock is safe - * because there is no concurrency as the object is not published yet. - */ - pi_state->owner = p; + __attach_to_pi_owner(p, key, ps); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); - *ps = pi_state; - return 0; } @@ -1454,8 +1458,26 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, newval |= FUTEX_WAITERS; ret = lock_pi_update_atomic(uaddr, uval, newval); - /* If the take over worked, return 1 */ - return ret < 0 ? ret : 1; + if (ret) + return ret; + + /* + * If the waiter bit was requested the caller also needs PI + * state attached to the new owner of the user space futex. + * + * @task is guaranteed to be alive and it cannot be exiting + * because it is either sleeping or waiting in + * futex_requeue_pi_wakeup_sync(). + * + * No need to do the full attach_to_pi_owner() exercise + * because @task is known and valid. + */ + if (set_waiters) { + raw_spin_lock_irq(&task->pi_lock); + __attach_to_pi_owner(task, key, ps); + raw_spin_unlock_irq(&task->pi_lock); + } + return 1; } /* @@ -1939,12 +1961,26 @@ static inline int futex_requeue_pi_wakeup_sync(struct futex_q *q) * @hb: the hash_bucket of the requeue target futex * * During futex_requeue, with requeue_pi=1, it is possible to acquire the - * target futex if it is uncontended or via a lock steal. Set the futex_q key - * to the requeue target futex so the waiter can detect the wakeup on the right - * futex, but remove it from the hb and NULL the rt_waiter so it can detect - * atomic lock acquisition. Set the q->lock_ptr to the requeue target hb->lock - * to protect access to the pi_state to fixup the owner later. Must be called - * with both q->lock_ptr and hb->lock held. + * target futex if it is uncontended or via a lock steal. + * + * 1) Set @q::key to the requeue target futex key so the waiter can detect + * the wakeup on the right futex. + * + * 2) Dequeue @q from the hash bucket. + * + * 3) Set @q::rt_waiter to NULL so the woken up task can detect atomic lock + * acquisition. + * + * 4) Set the q->lock_ptr to the requeue target hb->lock for the case that + * the waiter has to fixup the pi state. + * + * 5) Complete the requeue state so the waiter can make progress. After + * this point the waiter task can return from the syscall immediately in + * case that the pi state does not have to be fixed up. + * + * 6) Wake the waiter task. + * + * Must be called with both q->lock_ptr and hb->lock held. */ static inline void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, @@ -1998,7 +2034,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, { struct futex_q *top_waiter = NULL; u32 curval; - int ret, vpid; + int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; @@ -2025,7 +2061,7 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, * and waiting on the 'waitqueue' futex which is always !PI. */ if (!top_waiter->rt_waiter || top_waiter->pi_state) - ret = -EINVAL; + return -EINVAL; /* Ensure we requeue to the expected futex. */ if (!match_futex(top_waiter->requeue_pi_key, key2)) @@ -2036,17 +2072,23 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, return -EAGAIN; /* - * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in - * the contended case or if set_waiters is 1. The pi_state is returned - * in ps in contended cases. + * Try to take the lock for top_waiter and set the FUTEX_WAITERS bit + * in the contended case or if @set_waiters is true. + * + * In the contended case PI state is attached to the lock owner. If + * the user space lock can be acquired then PI state is attached to + * the new owner (@top_waiter->task) when @set_waiters is true. */ - vpid = task_pid_vnr(top_waiter->task); ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, exiting, set_waiters); if (ret == 1) { - /* Dequeue, wake up and update top_waiter::requeue_state */ + /* + * Lock was acquired in user space and PI state was + * attached to @top_waiter->task. That means state is fully + * consistent and the waiter can return to user space + * immediately after the wakeup. + */ requeue_pi_wake_futex(top_waiter, key2, hb2); - return vpid; } else if (ret < 0) { /* Rewind top_waiter::requeue_state */ futex_requeue_pi_complete(top_waiter, ret); @@ -2208,19 +2250,26 @@ retry_private: &exiting, nr_requeue); /* - * At this point the top_waiter has either taken uaddr2 or is - * waiting on it. If the former, then the pi_state will not - * exist yet, look it up one more time to ensure we have a - * reference to it. If the lock was taken, @ret contains the - * VPID of the top waiter task. - * If the lock was not taken, we have pi_state and an initial - * refcount on it. In case of an error we have nothing. + * At this point the top_waiter has either taken uaddr2 or + * is waiting on it. In both cases pi_state has been + * established and an initial refcount on it. In case of an + * error there's nothing. * * The top waiter's requeue_state is up to date: * - * - If the lock was acquired atomically (ret > 0), then + * - If the lock was acquired atomically (ret == 1), then * the state is Q_REQUEUE_PI_LOCKED. * + * The top waiter has been dequeued and woken up and can + * return to user space immediately. The kernel/user + * space state is consistent. In case that there must be + * more waiters requeued the WAITERS bit in the user + * space futex is set so the top waiter task has to go + * into the syscall slowpath to unlock the futex. This + * will block until this requeue operation has been + * completed and the hash bucket locks have been + * dropped. + * * - If the trylock failed with an error (ret < 0) then * the state is either Q_REQUEUE_PI_NONE, i.e. "nothing * happened", or Q_REQUEUE_PI_IGNORE when there was an @@ -2234,36 +2283,20 @@ retry_private: * the same sanity checks for requeue_pi as the loop * below does. */ - if (ret > 0) { - WARN_ON(pi_state); - task_count++; - /* - * If futex_proxy_trylock_atomic() acquired the - * user space futex, then the user space value - * @uaddr2 has been set to the @hb1's top waiter - * task VPID. This task is guaranteed to be alive - * and cannot be exiting because it is either - * sleeping or blocked on @hb2 lock. - * - * The @uaddr2 futex cannot have waiters either as - * otherwise futex_proxy_trylock_atomic() would not - * have succeeded. - * - * In order to requeue waiters to @hb2, pi state is - * required. Hand in the VPID value (@ret) and - * allocate PI state with an initial refcount on - * it. - */ - ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state, - &exiting); - WARN_ON(ret); - } - switch (ret) { case 0: /* We hold a reference on the pi state. */ break; + case 1: + /* + * futex_proxy_trylock_atomic() acquired the user space + * futex. Adjust task_count. + */ + task_count++; + ret = 0; + break; + /* * If the above failed, then pi_state is NULL and * waiter::requeue_state is correct. @@ -2395,9 +2428,8 @@ retry_private: } /* - * We took an extra initial reference to the pi_state either in - * futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need - * to drop it here again. + * We took an extra initial reference to the pi_state in + * futex_proxy_trylock_atomic(). We need to drop it here again. */ put_pi_state(pi_state); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 19e83e9b723c..4d8fc65cf38f 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -136,7 +136,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); * Allocates and initializes an irq_domain structure. * Returns pointer to IRQ domain, or NULL on failure. */ -struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, +struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, unsigned int size, irq_hw_number_t hwirq_max, int direct_max, const struct irq_domain_ops *ops, void *host_data) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8eabdc79602b..6bb116c559b4 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -753,7 +753,7 @@ static int __sched rt_mutex_adjust_prio_chain(struct task_struct *task, * other configuration and we fail to report; also, see * lockdep. */ - if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter->ww_ctx) + if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx) ret = 0; raw_spin_unlock(&lock->wait_lock); diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 4ba15088e640..88191f6e252c 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -41,6 +41,12 @@ * The risk of writer starvation is there, but the pathological use cases * which trigger it are not necessarily the typical RT workloads. * + * Fast-path orderings: + * The lock/unlock of readers can run in fast paths: lock and unlock are only + * atomic ops, and there is no inner lock to provide ACQUIRE and RELEASE + * semantics of rwbase_rt. Atomic ops should thus provide _acquire() + * and _release() (or stronger). + * * Common code shared between RT rw_semaphore and rwlock */ @@ -53,6 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) * set. */ for (r = atomic_read(&rwb->readers); r < 0;) { + /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */ if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) return 1; } @@ -162,6 +169,8 @@ static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, /* * rwb->readers can only hit 0 when a writer is waiting for the * active readers to leave the critical section. + * + * dec_and_test() is fully ordered, provides RELEASE. */ if (unlikely(atomic_dec_and_test(&rwb->readers))) __rwbase_read_unlock(rwb, state); @@ -172,7 +181,11 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, { struct rt_mutex_base *rtm = &rwb->rtmutex; - atomic_add(READER_BIAS - bias, &rwb->readers); + /* + * _release() is needed in case that reader is in fast path, pairing + * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE + */ + (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); rwbase_rtmutex_unlock(rtm); } @@ -196,6 +209,23 @@ static inline void rwbase_write_downgrade(struct rwbase_rt *rwb) __rwbase_write_unlock(rwb, WRITER_BIAS - 1, flags); } +static inline bool __rwbase_write_trylock(struct rwbase_rt *rwb) +{ + /* Can do without CAS because we're serialized by wait_lock. */ + lockdep_assert_held(&rwb->rtmutex.wait_lock); + + /* + * _acquire is needed in case the reader is in the fast path, pairing + * with rwbase_read_unlock(), provides ACQUIRE. + */ + if (!atomic_read_acquire(&rwb->readers)) { + atomic_set(&rwb->readers, WRITER_BIAS); + return 1; + } + + return 0; +} + static int __sched rwbase_write_lock(struct rwbase_rt *rwb, unsigned int state) { @@ -210,34 +240,30 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb, atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * set_current_state() for rw_semaphore - * current_save_and_set_rtlock_wait_state() for rwlock - */ - rwbase_set_and_save_current_state(state); + if (__rwbase_write_trylock(rwb)) + goto out_unlock; - /* Block until all readers have left the critical section. */ - for (; atomic_read(&rwb->readers);) { + rwbase_set_and_save_current_state(state); + for (;;) { /* Optimized out for rwlocks */ if (rwbase_signal_pending_state(state, current)) { - __set_current_state(TASK_RUNNING); + rwbase_restore_current_state(); __rwbase_write_unlock(rwb, 0, flags); return -EINTR; } + + if (__rwbase_write_trylock(rwb)) + break; + raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); + rwbase_schedule(); + raw_spin_lock_irqsave(&rtm->wait_lock, flags); - /* - * Schedule and wait for the readers to leave the critical - * section. The last reader leaving it wakes the waiter. - */ - if (atomic_read(&rwb->readers) != 0) - rwbase_schedule(); set_current_state(state); - raw_spin_lock_irqsave(&rtm->wait_lock, flags); } - - atomic_set(&rwb->readers, WRITER_BIAS); rwbase_restore_current_state(); + +out_unlock: raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 0; } @@ -253,8 +279,7 @@ static inline int rwbase_write_trylock(struct rwbase_rt *rwb) atomic_sub(READER_BIAS, &rwb->readers); raw_spin_lock_irqsave(&rtm->wait_lock, flags); - if (!atomic_read(&rwb->readers)) { - atomic_set(&rwb->readers, WRITER_BIAS); + if (__rwbase_write_trylock(rwb)) { raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); return 1; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 9215b4d6a9de..000e8d5a2884 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1376,15 +1376,17 @@ static inline void __downgrade_write(struct rw_semaphore *sem) #include "rwbase_rt.c" -#ifdef CONFIG_DEBUG_LOCK_ALLOC -void __rwsem_init(struct rw_semaphore *sem, const char *name, +void __init_rwsem(struct rw_semaphore *sem, const char *name, struct lock_class_key *key) { + init_rwbase_rt(&(sem)->rwbase); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map_wait(&sem->dep_map, name, key, 0, LD_WAIT_SLEEP); -} -EXPORT_SYMBOL(__rwsem_init); #endif +} +EXPORT_SYMBOL(__init_rwsem); static inline void __down_read(struct rw_semaphore *sem) { diff --git a/kernel/module.c b/kernel/module.c index 40ec9a030eec..5c26a76e800b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4489,8 +4489,10 @@ static void cfi_init(struct module *mod) /* Fix init/exit functions to point to the CFI jump table */ if (init) mod->init = *init; +#ifdef CONFIG_MODULE_UNLOAD if (exit) mod->exit = *exit; +#endif cfi_module_add(mod, module_addr_min); #endif diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 825277e1e742..a8d0a58deebc 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1166,9 +1166,9 @@ void __init setup_log_buf(int early) return; err_free_descs: - memblock_free(__pa(new_descs), new_descs_size); + memblock_free_ptr(new_descs, new_descs_size); err_free_log_buf: - memblock_free(__pa(new_log_buf), new_log_buf_len); + memblock_free_ptr(new_log_buf, new_log_buf_len); } static bool __read_mostly ignore_loglevel; diff --git a/kernel/rseq.c b/kernel/rseq.c index 35f7bd0fced0..6d45ac3dae7f 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c @@ -282,9 +282,17 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) if (unlikely(t->flags & PF_EXITING)) return; - ret = rseq_ip_fixup(regs); - if (unlikely(ret < 0)) - goto error; + + /* + * regs is NULL if and only if the caller is in a syscall path. Skip + * fixup and leave rseq_cs as is so that rseq_sycall() will detect and + * kill a misbehaving userspace on debug kernels. + */ + if (regs) { + ret = rseq_ip_fixup(regs); + if (unlikely(ret < 0)) + goto error; + } if (unlikely(rseq_update_cpu_id(t))) goto error; return; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c4462c454ab9..f21714ea3db8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8795,6 +8795,7 @@ void idle_task_exit(void) finish_arch_post_lock_switch(); } + scs_task_reset(current); /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } @@ -8836,7 +8837,6 @@ static void balance_push(struct rq *rq) struct task_struct *push_task = rq->curr; lockdep_assert_rq_held(rq); - SCHED_WARN_ON(rq->cpu != smp_processor_id()); /* * Ensure the thing is persistent until balance_push_set(.on = false); @@ -8844,9 +8844,10 @@ static void balance_push(struct rq *rq) rq->balance_callback = &balance_push_callback; /* - * Only active while going offline. + * Only active while going offline and when invoked on the outgoing + * CPU. */ - if (!cpu_dying(rq->cpu)) + if (!cpu_dying(rq->cpu) || rq != this_rq()) return; /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 49716228efb4..17a653b67006 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -173,16 +173,22 @@ static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { char buf[16]; + unsigned int scaling; if (cnt > 15) cnt = 15; if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = '\0'; - if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling)) + if (kstrtouint(buf, 10, &scaling)) return -EINVAL; + if (scaling >= SCHED_TUNABLESCALING_END) + return -EINVAL; + + sysctl_sched_tunable_scaling = scaling; if (sched_update_scaling()) return -EINVAL; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ff69f245b939..f6a05d9b5443 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4936,8 +4936,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - if (!cfs_rq->load.weight) + /* Nothing to run but something to decay (on_list)? Complete the branch */ + if (!cfs_rq->load.weight) { + if (cfs_rq->on_list) + goto unthrottle_throttle; return; + } task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 912b47aa99d8..d17b0a5ce6ac 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -379,10 +379,10 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) cpuidle_use_deepest_state(latency_ns); it.done = 0; - hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); it.timer.function = idle_inject_timer_fn; hrtimer_start(&it.timer, ns_to_ktime(duration_ns), - HRTIMER_MODE_REL_PINNED); + HRTIMER_MODE_REL_PINNED_HARD); while (!READ_ONCE(it.done)) do_idle(); diff --git a/kernel/signal.c b/kernel/signal.c index 952741f6d0f9..487bf4f5dadf 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -426,22 +426,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, */ rcu_read_lock(); ucounts = task_ucounts(t); - sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); - switch (sigpending) { - case 1: - if (likely(get_ucounts(ucounts))) - break; - fallthrough; - case LONG_MAX: - /* - * we need to decrease the ucount in the userns tree on any - * failure to avoid counts leaking. - */ - dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); - rcu_read_unlock(); - return NULL; - } + sigpending = inc_rlimit_get_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); rcu_read_unlock(); + if (!sigpending) + return NULL; if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); @@ -450,8 +438,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, } if (unlikely(q == NULL)) { - if (dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) - put_ucounts(ucounts); + dec_rlimit_put_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING); } else { INIT_LIST_HEAD(&q->list); q->flags = sigqueue_flags; @@ -464,8 +451,8 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) { - put_ucounts(q->ucounts); + if (q->ucounts) { + dec_rlimit_put_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING); q->ucounts = NULL; } kmem_cache_free(sigqueue_cachep, q); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index ee736861b18f..643d412ac623 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -1404,7 +1404,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clkid, } } - *newval += now; + if (*newval) + *newval += now; } /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c221e4c3f625..fa91f398f28b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1605,6 +1605,14 @@ static int blk_trace_remove_queue(struct request_queue *q) if (bt == NULL) return -EINVAL; + if (bt->trace_state == Blktrace_running) { + bt->trace_state = Blktrace_stopped; + spin_lock_irq(&running_trace_lock); + list_del_init(&bt->running_list); + spin_unlock_irq(&running_trace_lock); + relay_flush(bt->rchan); + } + put_probe_ref(); synchronize_rcu(); blk_trace_free(bt); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7efbc8aaf7f6..feebf57c6458 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2208,7 +2208,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) } /** - * ftrace_update_record, set a record that now is tracing or not + * ftrace_update_record - set a record that now is tracing or not * @rec: the record to update * @enable: set to true if the record is tracing, false to force disable * @@ -2221,7 +2221,7 @@ int ftrace_update_record(struct dyn_ftrace *rec, bool enable) } /** - * ftrace_test_record, check if the record has been enabled or not + * ftrace_test_record - check if the record has been enabled or not * @rec: the record to test * @enable: set to true to check if enabled, false if it is disabled * @@ -2574,7 +2574,7 @@ struct ftrace_rec_iter { }; /** - * ftrace_rec_iter_start, start up iterating over traced functions + * ftrace_rec_iter_start - start up iterating over traced functions * * Returns an iterator handle that is used to iterate over all * the records that represent address locations where functions @@ -2605,7 +2605,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_start(void) } /** - * ftrace_rec_iter_next, get the next record to process. + * ftrace_rec_iter_next - get the next record to process. * @iter: The handle to the iterator. * * Returns the next iterator after the given iterator @iter. @@ -2630,7 +2630,7 @@ struct ftrace_rec_iter *ftrace_rec_iter_next(struct ftrace_rec_iter *iter) } /** - * ftrace_rec_iter_record, get the record at the iterator location + * ftrace_rec_iter_record - get the record at the iterator location * @iter: The current iterator location * * Returns the record that the current @iter is at. @@ -2733,7 +2733,7 @@ static int __ftrace_modify_code(void *data) } /** - * ftrace_run_stop_machine, go back to the stop machine method + * ftrace_run_stop_machine - go back to the stop machine method * @command: The command to tell ftrace what to do * * If an arch needs to fall back to the stop machine method, the @@ -2745,7 +2745,7 @@ void ftrace_run_stop_machine(int command) } /** - * arch_ftrace_update_code, modify the code to trace or not trace + * arch_ftrace_update_code - modify the code to trace or not trace * @command: The command that needs to be done * * Archs can override this function if it does not need to @@ -6977,7 +6977,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START); if (bit < 0) return; @@ -7052,7 +7052,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START); if (bit < 0) return; @@ -7525,7 +7525,9 @@ void ftrace_kill(void) } /** - * Test if ftrace is dead or not. + * ftrace_is_dead - Test if ftrace is dead or not. + * + * Returns 1 if ftrace is "dead", zero otherwise. */ int ftrace_is_dead(void) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7896d30d90f7..bc677cd64224 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1744,16 +1744,15 @@ void latency_fsnotify(struct trace_array *tr) irq_work_queue(&tr->fsnotify_irqwork); } -/* - * (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ - * defined(CONFIG_FSNOTIFY) - */ -#else +#elif defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) \ + || defined(CONFIG_OSNOISE_TRACER) #define trace_create_maxlat_file(tr, d_tracer) \ trace_create_file("tracing_max_latency", 0644, d_tracer, \ &tr->max_latency, &tracing_max_lat_fops) +#else +#define trace_create_maxlat_file(tr, d_tracer) do { } while (0) #endif #ifdef CONFIG_TRACER_MAX_TRACE @@ -9473,9 +9472,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) create_trace_options_dir(tr); -#if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) trace_create_maxlat_file(tr, d_tracer); -#endif if (ftrace_create_function_files(tr, d_tracer)) MEM_FAIL(1, "Could not allocate function filter files"); diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 388e65d05978..8d252f63cd78 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -219,13 +219,12 @@ static int __init trace_boot_hist_add_array(struct xbc_node *hnode, char **bufp, char *end, const char *key) { - struct xbc_node *knode, *anode; + struct xbc_node *anode; const char *p; char sep; - knode = xbc_node_find_child(hnode, key); - if (knode) { - anode = xbc_node_get_child(knode); + p = xbc_node_find_value(hnode, key, &anode); + if (p) { if (!anode) { pr_err("hist.%s requires value(s).\n", key); return -EINVAL; @@ -263,9 +262,9 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, append_printf(bufp, end, ":%s(%s)", handler, p); /* Compose 'action' parameter */ - knode = xbc_node_find_child(hnode, "trace"); + knode = xbc_node_find_subkey(hnode, "trace"); if (!knode) - knode = xbc_node_find_child(hnode, "save"); + knode = xbc_node_find_subkey(hnode, "save"); if (knode) { anode = xbc_node_get_child(knode); @@ -284,7 +283,7 @@ trace_boot_hist_add_one_handler(struct xbc_node *hnode, char **bufp, sep = ','; } append_printf(bufp, end, ")"); - } else if (xbc_node_find_child(hnode, "snapshot")) { + } else if (xbc_node_find_subkey(hnode, "snapshot")) { append_printf(bufp, end, ".snapshot()"); } else { pr_err("hist.%s requires an action.\n", @@ -315,7 +314,7 @@ trace_boot_hist_add_handlers(struct xbc_node *hnode, char **bufp, break; } - if (xbc_node_find_child(hnode, param)) + if (xbc_node_find_subkey(hnode, param)) ret = trace_boot_hist_add_one_handler(hnode, bufp, end, handler, param); return ret; @@ -375,7 +374,7 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) if (p) append_printf(&buf, end, ":name=%s", p); - node = xbc_node_find_child(hnode, "var"); + node = xbc_node_find_subkey(hnode, "var"); if (node) { xbc_node_for_each_key_value(node, knode, p) { /* Expression must not include spaces. */ @@ -386,21 +385,21 @@ trace_boot_compose_hist_cmd(struct xbc_node *hnode, char *buf, size_t size) } /* Histogram control attributes (mutual exclusive) */ - if (xbc_node_find_child(hnode, "pause")) + if (xbc_node_find_value(hnode, "pause", NULL)) append_printf(&buf, end, ":pause"); - else if (xbc_node_find_child(hnode, "continue")) + else if (xbc_node_find_value(hnode, "continue", NULL)) append_printf(&buf, end, ":continue"); - else if (xbc_node_find_child(hnode, "clear")) + else if (xbc_node_find_value(hnode, "clear", NULL)) append_printf(&buf, end, ":clear"); /* Histogram handler and actions */ - node = xbc_node_find_child(hnode, "onmax"); + node = xbc_node_find_subkey(hnode, "onmax"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onchange"); + node = xbc_node_find_subkey(hnode, "onchange"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "var") < 0) return -EINVAL; - node = xbc_node_find_child(hnode, "onmatch"); + node = xbc_node_find_subkey(hnode, "onmatch"); if (node && trace_boot_hist_add_handlers(node, &buf, end, "event") < 0) return -EINVAL; @@ -437,7 +436,7 @@ trace_boot_init_histograms(struct trace_event_file *file, } } - if (xbc_node_find_child(hnode, "keys")) { + if (xbc_node_find_subkey(hnode, "keys")) { if (trace_boot_compose_hist_cmd(hnode, buf, size) == 0) { tmp = kstrdup(buf, GFP_KERNEL); if (trigger_process_regex(file, buf) < 0) @@ -496,7 +495,7 @@ trace_boot_init_one_event(struct trace_array *tr, struct xbc_node *gnode, else if (trigger_process_regex(file, buf) < 0) pr_err("Failed to apply an action: %s\n", p); } - anode = xbc_node_find_child(enode, "hist"); + anode = xbc_node_find_subkey(enode, "hist"); if (anode) trace_boot_init_histograms(file, anode, buf, ARRAY_SIZE(buf)); } else if (xbc_node_find_value(enode, "actions", NULL)) @@ -518,7 +517,7 @@ trace_boot_init_events(struct trace_array *tr, struct xbc_node *node) bool enable, enable_all = false; const char *data; - node = xbc_node_find_child(node, "event"); + node = xbc_node_find_subkey(node, "event"); if (!node) return; /* per-event key starts with "event.GROUP.EVENT" */ @@ -621,7 +620,7 @@ trace_boot_init_instances(struct xbc_node *node) struct trace_array *tr; const char *p; - node = xbc_node_find_child(node, "instance"); + node = xbc_node_find_subkey(node, "instance"); if (!node) return; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 3044b762cbd7..928867f527e7 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -119,10 +119,58 @@ static bool eprobe_dyn_event_match(const char *system, const char *event, int argc, const char **argv, struct dyn_event *ev) { struct trace_eprobe *ep = to_trace_eprobe(ev); + const char *slash; - return strcmp(trace_probe_name(&ep->tp), event) == 0 && - (!system || strcmp(trace_probe_group_name(&ep->tp), system) == 0) && - trace_probe_match_command_args(&ep->tp, argc, argv); + /* + * We match the following: + * event only - match all eprobes with event name + * system and event only - match all system/event probes + * + * The below has the above satisfied with more arguments: + * + * attached system/event - If the arg has the system and event + * the probe is attached to, match + * probes with the attachment. + * + * If any more args are given, then it requires a full match. + */ + + /* + * If system exists, but this probe is not part of that system + * do not match. + */ + if (system && strcmp(trace_probe_group_name(&ep->tp), system) != 0) + return false; + + /* Must match the event name */ + if (strcmp(trace_probe_name(&ep->tp), event) != 0) + return false; + + /* No arguments match all */ + if (argc < 1) + return true; + + /* First argument is the system/event the probe is attached to */ + + slash = strchr(argv[0], '/'); + if (!slash) + slash = strchr(argv[0], '.'); + if (!slash) + return false; + + if (strncmp(ep->event_system, argv[0], slash - argv[0])) + return false; + if (strcmp(ep->event_name, slash + 1)) + return false; + + argc--; + argv++; + + /* If there are no other args, then match */ + if (argc < 1) + return true; + + return trace_probe_match_command_args(&ep->tp, argc, argv); } static struct dyn_event_operations eprobe_dyn_event_ops = { @@ -632,6 +680,13 @@ static int disable_eprobe(struct trace_eprobe *ep, trace_event_trigger_enable_disable(file, 0); update_cond_flag(file); + + /* Make sure nothing is using the edata or trigger */ + tracepoint_synchronize_unregister(); + + kfree(edata); + kfree(trigger); + return 0; } @@ -849,8 +904,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) if (IS_ERR(ep)) { ret = PTR_ERR(ep); - /* This must return -ENOMEM, else there is a bug */ - WARN_ON_ONCE(ret != -ENOMEM); + /* This must return -ENOMEM or missing event, else there is a bug */ + WARN_ON_ONCE(ret != -ENOMEM && ret != -ENODEV); ep = NULL; goto error; } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a6061a69aa84..f01e442716e2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2506,7 +2506,7 @@ find_synthetic_field_var(struct hist_trigger_data *target_hist_data, * events. However, for convenience, users are allowed to directly * specify an event field in an action, which will be automatically * converted into a variable on their behalf. - + * * If a user specifies a field on an event that isn't the event the * histogram currently being defined (the target event histogram), the * only way that can be accomplished is if a new hist trigger is diff --git a/kernel/ucount.c b/kernel/ucount.c index bb51849e6375..eb03f3c68375 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -284,6 +284,55 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) return (new == 0); } +static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, + struct ucounts *last, enum ucount_type type) +{ + struct ucounts *iter, *next; + for (iter = ucounts; iter != last; iter = next) { + long dec = atomic_long_add_return(-1, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + next = iter->ns->ucounts; + if (dec == 0) + put_ucounts(iter); + } +} + +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type) +{ + do_dec_rlimit_put_ucounts(ucounts, NULL, type); +} + +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) +{ + /* Caller must hold a reference to ucounts */ + struct ucounts *iter; + long dec, ret = 0; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long max = READ_ONCE(iter->ns->ucount_max[type]); + long new = atomic_long_add_return(1, &iter->ucount[type]); + if (new < 0 || new > max) + goto unwind; + if (iter == ucounts) + ret = new; + /* + * Grab an extra ucount reference for the caller when + * the rlimit count was previously 0. + */ + if (new != 1) + continue; + if (!get_ucounts(iter)) + goto dec_unwind; + } + return ret; +dec_unwind: + dec = atomic_long_add_return(-1, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); +unwind: + do_dec_rlimit_put_ucounts(ucounts, iter, type); + return 0; +} + bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) { struct ucounts *iter; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 33a6b4a2443d..1b3eb1e9531f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4830,8 +4830,16 @@ void show_workqueue_state(void) for_each_pwq(pwq, wq) { raw_spin_lock_irqsave(&pwq->pool->lock, flags); - if (pwq->nr_active || !list_empty(&pwq->inactive_works)) + if (pwq->nr_active || !list_empty(&pwq->inactive_works)) { + /* + * Defer printing to avoid deadlocks in console + * drivers that queue work while holding locks + * also taken in their write paths. + */ + printk_deferred_enter(); show_pwq(pwq); + printk_deferred_exit(); + } raw_spin_unlock_irqrestore(&pwq->pool->lock, flags); /* * We could be printing a lot from atomic context, e.g. @@ -4849,7 +4857,12 @@ void show_workqueue_state(void) raw_spin_lock_irqsave(&pool->lock, flags); if (pool->nr_workers == pool->nr_idle) goto next_pool; - + /* + * Defer printing to avoid deadlocks in console drivers that + * queue work while holding locks also taken in their write + * paths. + */ + printk_deferred_enter(); pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); pr_cont(" hung=%us workers=%d", @@ -4864,6 +4877,7 @@ void show_workqueue_state(void) first = false; } pr_cont("\n"); + printk_deferred_exit(); next_pool: raw_spin_unlock_irqrestore(&pool->lock, flags); /* |