Diffstat (limited to 'kernel')
120 files changed, 4467 insertions, 1876 deletions
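Two conversion patterns recur throughout the diff below. As an orientation aid, here is a minimal sketch (not part of the patch; the helpers count_vmas() and pick_offset() are invented for illustration) of the new-style idioms: bounded random numbers move from get_random_int() % n to prandom_u32_max(n) (plain 32-bit values use get_random_u32()), and VMA walks move from the mm->mmap / vm_next linked list to the maple-tree based VMA_ITERATOR() / for_each_vma().

#include <linux/mm.h>
#include <linux/prandom.h>

/* Hypothetical helper: count the VMAs of an mm with the maple-tree iterator. */
static unsigned long count_vmas(struct mm_struct *mm)
{
	VMA_ITERATOR(vmi, mm, 0);		/* start the walk at address 0 */
	struct vm_area_struct *vma;
	unsigned long nr = 0;

	mmap_read_lock(mm);
	for_each_vma(vmi, vma)			/* replaces "vma = vma->vm_next" loops */
		nr++;
	mmap_read_unlock(mm);

	return nr;
}

/* Hypothetical helper: pick a random offset below "hole". */
static u32 pick_offset(u32 hole)
{
	return prandom_u32_max(hole);		/* was: get_random_int() % hole */
}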
diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d3..d754e0be1176 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n +KMSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # Don't instrument error handlers diff --git a/kernel/acct.c b/kernel/acct.c index 13706356ec54..62200d799b9b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead) unsigned long vsize = 0; if (group_dead && current->mm) { + struct mm_struct *mm = current->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - mmap_read_lock(current->mm); - vma = current->mm->mmap; - while (vma) { + mmap_read_lock(mm); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); } spin_lock_irq(&current->sighand->siglock); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 4b0957aa2cd4..65075f1e4ac8 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -133,7 +133,7 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) } /* Initialize a parent watch entry. */ -static struct audit_parent *audit_init_parent(struct path *path) +static struct audit_parent *audit_init_parent(const struct path *path) { struct inode *inode = d_backing_inode(path->dentry); struct audit_parent *parent; diff --git a/kernel/bounds.c b/kernel/bounds.c index 9795d75b09b2..b529182e8b04 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -22,6 +22,13 @@ int main(void) DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t)); +#ifdef CONFIG_LRU_GEN + DEFINE(LRU_GEN_WIDTH, order_base_2(MAX_NR_GENS + 1)); + DEFINE(__LRU_REFS_WIDTH, MAX_NR_TIERS - 2); +#else + DEFINE(LRU_GEN_WIDTH, 0); + DEFINE(__LRU_REFS_WIDTH, 0); +#endif /* End of constants */ return 0; diff --git a/kernel/bpf/bloom_filter.c b/kernel/bpf/bloom_filter.c index b9ea539a5561..48ee750849f2 100644 --- a/kernel/bpf/bloom_filter.c +++ b/kernel/bpf/bloom_filter.c @@ -158,7 +158,7 @@ static struct bpf_map *bloom_map_alloc(union bpf_attr *attr) attr->value_size / sizeof(u32); if (!(attr->map_flags & BPF_F_ZERO_SEED)) - bloom->hash_seed = get_random_int(); + bloom->hash_seed = get_random_u32(); return &bloom->map; } diff --git a/kernel/bpf/cgroup_iter.c b/kernel/bpf/cgroup_iter.c index 0d200a993489..9fcf09f2ef00 100644 --- a/kernel/bpf/cgroup_iter.c +++ b/kernel/bpf/cgroup_iter.c @@ -196,7 +196,7 @@ static int bpf_iter_attach_cgroup(struct bpf_prog *prog, return -EINVAL; if (fd) - cgrp = cgroup_get_from_fd(fd); + cgrp = cgroup_v1v2_get_from_fd(fd); else if (id) cgrp = cgroup_get_from_id(id); else /* walk the entire hierarchy by default. */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 711fd293b6de..25a54e04560e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1032,7 +1032,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = prandom_u32_max(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code.
*/ *image_ptr = &hdr->image[start]; @@ -1094,7 +1094,7 @@ bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); - start = (get_random_int() % hole) & ~(alignment - 1); + start = prandom_u32_max(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; @@ -1216,7 +1216,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, bool emit_zext) { struct bpf_insn *to = to_buff; - u32 imm_rnd = get_random_int(); + u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); @@ -2007,7 +2007,7 @@ out: static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_EXT_REG]; \ + u64 regs[MAX_BPF_EXT_REG] = {}; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ed3f8a53603b..f39ee3e05589 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -527,7 +527,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else - htab->hashrnd = get_random_int(); + htab->hashrnd = get_random_u32(); htab_init_buckets(htab); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 1adbe67cdb95..aecea7451b61 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -338,7 +338,7 @@ BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, int ret; /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return bpf_get_stackid((unsigned long)(ctx->regs), (unsigned long) map, flags, 0, 0); @@ -506,7 +506,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, int err = -EINVAL; __u64 nr_kernel; - if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)) return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 67e03e1833ba..c2a2182ce570 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -445,8 +445,8 @@ struct bpf_iter_seq_task_vma_info { }; enum bpf_task_vma_iter_find_op { - task_vma_iter_first_vma, /* use mm->mmap */ - task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_first_vma, /* use find_vma() with addr 0 */ + task_vma_iter_next_vma, /* use vma_next() with curr_vma */ task_vma_iter_find_vma, /* use find_vma() to find next vma */ }; @@ -544,10 +544,10 @@ again: switch (op) { case task_vma_iter_first_vma: - curr_vma = curr_task->mm->mmap; + curr_vma = find_vma(curr_task->mm, 0); break; case task_vma_iter_next_vma: - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma @@ -561,7 +561,7 @@ again: if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; } if (!curr_vma) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9ab7188d8f68..7f0a9f6cb889 100644 --- 
a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -13351,7 +13351,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, aux[adj_idx].ptr_type == PTR_TO_CTX) continue; - imm_rnd = get_random_int(); + imm_rnd = get_random_u32(); rnd_hi32_patch[0] = insn; rnd_hi32_patch[1].imm = imm_rnd; rnd_hi32_patch[3].dst_reg = load_reg; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 36b740cb3d59..fd4020835ec6 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -164,7 +164,6 @@ struct cgroup_mgctx { #define DEFINE_CGROUP_MGCTX(name) \ struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name) -extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; extern struct cgroup_subsys *cgroup_subsys[]; extern struct list_head cgroup_roots; @@ -250,6 +249,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup, int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader, bool threadgroup); +void cgroup_attach_lock(bool lock_threadgroup); +void cgroup_attach_unlock(bool lock_threadgroup); struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup, bool *locked) __acquires(&cgroup_threadgroup_rwsem); diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index ff6a8099eb2a..52bb5a74a23b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -59,8 +59,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) int retval = 0; mutex_lock(&cgroup_mutex); - cpus_read_lock(); - percpu_down_write(&cgroup_threadgroup_rwsem); + cgroup_attach_lock(true); for_each_root(root) { struct cgroup *from_cgrp; @@ -72,8 +71,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) if (retval) break; } - percpu_up_write(&cgroup_threadgroup_rwsem); - cpus_read_unlock(); + cgroup_attach_unlock(true); mutex_unlock(&cgroup_mutex); return retval; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8ad2c267ff47..2319946715e0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -217,6 +217,7 @@ struct cgroup_namespace init_cgroup_ns = { static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; +static struct cftype cgroup_psi_files[]; /* cgroup optional features */ enum cgroup_opt_features { @@ -1391,6 +1392,9 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +/* + * Returned cgroup is without refcount but it's valid as long as cset pins it. + */ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { @@ -1402,6 +1406,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, res_cgroup = cset->dfl_cgrp; } else { struct cgrp_cset_link *link; + lockdep_assert_held(&css_set_lock); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; @@ -1413,6 +1418,7 @@ static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, } } + BUG_ON(!res_cgroup); return res_cgroup; } @@ -1435,23 +1441,36 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_unlock(); - BUG_ON(!res); return res; } +/* + * Look up cgroup associated with current task's cgroup namespace on the default + * hierarchy. + * + * Unlike current_cgns_cgroup_from_root(), this doesn't need locks: + * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu + * pointers. 
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp. + * - As a bonus returned cgrp is pinned with the current because it cannot + * switch cgroup_ns asynchronously. + */ +static struct cgroup *current_cgns_cgroup_dfl(void) +{ + struct css_set *cset; + + cset = current->nsproxy->cgroup_ns->root_cset; + return __cset_cgroup_from_root(cset, &cgrp_dfl_root); +} + /* look up cgroup associated with given css_set on the specified hierarchy */ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, struct cgroup_root *root) { - struct cgroup *res = NULL; - lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - res = __cset_cgroup_from_root(cset, root); - - BUG_ON(!res); - return res; + return __cset_cgroup_from_root(cset, root); } /* @@ -1689,12 +1708,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - - cgroup_addrm_files(css, cgrp, cfts, false); + if (cgroup_on_dfl(cgrp)) { + cgroup_addrm_files(css, cgrp, + cgroup_base_files, false); + if (cgroup_psi_enabled()) + cgroup_addrm_files(css, cgrp, + cgroup_psi_files, false); + } else { + cgroup_addrm_files(css, cgrp, + cgroup1_base_files, false); + } } else { list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); @@ -1717,14 +1740,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css) return 0; if (!css->ss) { - if (cgroup_on_dfl(cgrp)) - cfts = cgroup_base_files; - else - cfts = cgroup1_base_files; - - ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - if (ret < 0) - return ret; + if (cgroup_on_dfl(cgrp)) { + ret = cgroup_addrm_files(&cgrp->self, cgrp, + cgroup_base_files, true); + if (ret < 0) + return ret; + + if (cgroup_psi_enabled()) { + ret = cgroup_addrm_files(&cgrp->self, cgrp, + cgroup_psi_files, true); + if (ret < 0) + return ret; + } + } else { + cgroup_addrm_files(css, cgrp, + cgroup1_base_files, true); + } } else { list_for_each_entry(cfts, &css->ss->cfts, node) { ret = cgroup_addrm_files(css, cgrp, cfts, true); @@ -2050,7 +2081,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) } root_cgrp->kn = kernfs_root_to_node(root->kf_root); WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1); - root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp); + root_cgrp->ancestors[0] = root_cgrp; ret = css_populate_dir(&root_cgrp->self); if (ret) @@ -2173,7 +2204,7 @@ static int cgroup_get_tree(struct fs_context *fc) struct cgroup_fs_context *ctx = cgroup_fc2context(fc); int ret; - cgrp_dfl_visible = true; + WRITE_ONCE(cgrp_dfl_visible, true); cgroup_get_live(&cgrp_dfl_root.cgrp); ctx->root = &cgrp_dfl_root; @@ -2361,7 +2392,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ - ret = strlcpy(buf, "/", buflen); + ret = strscpy(buf, "/", buflen); } spin_unlock_irq(&css_set_lock); @@ -2393,7 +2424,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path); * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that * CPU hotplug is disabled on entry. 
*/ -static void cgroup_attach_lock(bool lock_threadgroup) +void cgroup_attach_lock(bool lock_threadgroup) { cpus_read_lock(); if (lock_threadgroup) @@ -2404,7 +2435,7 @@ static void cgroup_attach_lock(bool lock_threadgroup) * cgroup_attach_unlock - Undo cgroup_attach_lock() * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem */ -static void cgroup_attach_unlock(bool lock_threadgroup) +void cgroup_attach_unlock(bool lock_threadgroup) { if (lock_threadgroup) percpu_up_write(&cgroup_threadgroup_rwsem); @@ -3292,11 +3323,7 @@ static int cgroup_apply_control(struct cgroup *cgrp) * making the following cgroup_update_dfl_csses() properly update * css associations of all tasks in the subtree. */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - return ret; - - return 0; + return cgroup_update_dfl_csses(cgrp); } /** @@ -3689,27 +3716,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; + struct psi_group *psi = cgroup_psi(cgrp); return psi_show(seq, psi, PSI_CPU); } -static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, enum psi_res res) +static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) { struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; @@ -3729,7 +3756,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return -EBUSY; } - psi = cgroup_ino(cgrp) == 1 ? 
&psi_system : cgrp->psi; + psi = cgroup_psi(cgrp); new = psi_trigger_create(psi, buf, res); if (IS_ERR(new)) { cgroup_put(cgrp); @@ -3746,21 +3773,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_IO); + return pressure_write(of, buf, nbytes, PSI_IO); } static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); + return pressure_write(of, buf, nbytes, PSI_MEM); } static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); + return pressure_write(of, buf, nbytes, PSI_CPU); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int cgroup_irq_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + return psi_show(seq, psi, PSI_IRQ); +} + +static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return pressure_write(of, buf, nbytes, PSI_IRQ); +} +#endif + +static int cgroup_pressure_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup_psi(cgrp); + + seq_printf(seq, "%d\n", psi->enabled); + + return 0; +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + ssize_t ret; + int enable; + struct cgroup *cgrp; + struct psi_group *psi; + + ret = kstrtoint(strstrip(buf), 0, &enable); + if (ret) + return ret; + + if (enable < 0 || enable > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + psi = cgroup_psi(cgrp); + if (psi->enabled != enable) { + int i; + + /* show or hide {cpu,memory,io,irq}.pressure files */ + for (i = 0; i < NR_PSI_RESOURCES; i++) + cgroup_file_show(&cgrp->psi_files[i], enable); + + psi->enabled = enable; + if (enable) + psi_cgroup_restart(psi); + } + + cgroup_kn_unlock(of->kn); + + return nbytes; } static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, @@ -3780,6 +3872,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) bool cgroup_psi_enabled(void) { + if (static_branch_likely(&psi_disabled)) + return false; + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; } @@ -4132,8 +4227,6 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? 
*/ - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) @@ -4198,21 +4291,25 @@ static void cgroup_exit_cftypes(struct cftype *cfts) cft->ss = NULL; /* revert flags set by cgroup core while adding @cfts */ - cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL | + __CFTYPE_ADDED); } } static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype *cft; + int ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { struct kernfs_ops *kf_ops; WARN_ON(cft->ss || cft->kf_ops); - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; + if (cft->flags & __CFTYPE_ADDED) { + ret = -EBUSY; + break; + } if (cft->seq_start) kf_ops = &cgroup_kf_ops; @@ -4226,26 +4323,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) { kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL); if (!kf_ops) { - cgroup_exit_cftypes(cfts); - return -ENOMEM; + ret = -ENOMEM; + break; } kf_ops->atomic_write_len = cft->max_write_len; } cft->kf_ops = kf_ops; cft->ss = ss; + cft->flags |= __CFTYPE_ADDED; } - return 0; + if (ret) + cgroup_exit_cftypes(cfts); + return ret; } static int cgroup_rm_cftypes_locked(struct cftype *cfts) { lockdep_assert_held(&cgroup_mutex); - if (!cfts || !cfts[0].ss) - return -ENOENT; - list_del(&cfts->node); cgroup_apply_cftypes(cfts, false); cgroup_exit_cftypes(cfts); @@ -4267,6 +4364,12 @@ int cgroup_rm_cftypes(struct cftype *cfts) { int ret; + if (!cfts || cfts[0].name[0] == '\0') + return 0; + + if (!(cfts[0].flags & __CFTYPE_ADDED)) + return -ENOENT; + mutex_lock(&cgroup_mutex); ret = cgroup_rm_cftypes_locked(cfts); mutex_unlock(&cgroup_mutex); @@ -4372,6 +4475,26 @@ void cgroup_file_notify(struct cgroup_file *cfile) } /** + * cgroup_file_show - show or hide a hidden cgroup file + * @cfile: target cgroup_file obtained by setting cftype->file_offset + * @show: whether to show or hide + */ +void cgroup_file_show(struct cgroup_file *cfile, bool show) +{ + struct kernfs_node *kn; + + spin_lock_irq(&cgroup_file_kn_lock); + kn = cfile->kn; + kernfs_get(kn); + spin_unlock_irq(&cgroup_file_kn_lock); + + if (kn) + kernfs_show(kn, show); + + kernfs_put(kn); +} + +/** * css_next_child - find the next child of a given css * @pos: the current position (%NULL to initiate traversal) * @parent: css whose children to walk @@ -5131,10 +5254,14 @@ static struct cftype cgroup_base_files[] = { .name = "cpu.stat", .seq_show = cpu_stat_show, }, + { } /* terminate */ +}; + +static struct cftype cgroup_psi_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -5142,7 +5269,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -5150,12 +5277,27 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", - .flags = CFTYPE_PRESSURE, + .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), .seq_show = 
cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, .release = cgroup_pressure_release, }, +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + { + .name = "irq.pressure", + .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), + .seq_show = cgroup_irq_pressure_show, + .write = cgroup_irq_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, + }, +#endif + { + .name = "cgroup.pressure", + .seq_show = cgroup_pressure_show, + .write = cgroup_pressure_write, + }, #endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -5432,8 +5574,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)), - GFP_KERNEL); + cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL); if (!cgrp) return ERR_PTR(-ENOMEM); @@ -5485,7 +5626,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { - cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp); + cgrp->ancestors[tcgrp->level] = tcgrp; if (tcgrp != cgrp) { tcgrp->nr_descendants++; @@ -5918,6 +6059,7 @@ int __init cgroup_init(void) BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); + BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); cgroup_rstat_boot(); @@ -6038,19 +6180,22 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup_get_from_id : get the cgroup associated with cgroup id * @id: cgroup id - * On success return the cgrp, on failure return NULL + * On success return the cgrp or ERR_PTR on failure + * Only cgroups within current task's cgroup NS are valid. */ struct cgroup *cgroup_get_from_id(u64 id) { struct kernfs_node *kn; - struct cgroup *cgrp = NULL; + struct cgroup *cgrp, *root_cgrp; kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id); if (!kn) - goto out; + return ERR_PTR(-ENOENT); - if (kernfs_type(kn) != KERNFS_DIR) - goto put; + if (kernfs_type(kn) != KERNFS_DIR) { + kernfs_put(kn); + return ERR_PTR(-ENOENT); + } rcu_read_lock(); @@ -6059,9 +6204,17 @@ struct cgroup *cgroup_get_from_id(u64 id) cgrp = NULL; rcu_read_unlock(); -put: kernfs_put(kn); -out: + + if (!cgrp) + return ERR_PTR(-ENOENT); + + root_cgrp = current_cgns_cgroup_dfl(); + if (!cgroup_is_descendant(cgrp, root_cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-ENOENT); + } + return cgrp; } EXPORT_SYMBOL_GPL(cgroup_get_from_id); @@ -6091,7 +6244,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_visible) + if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible)) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -6157,16 +6310,42 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -static struct cgroup *cgroup_get_from_file(struct file *f) +/** + * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer + * @f: file corresponding to cgroup_dir + * + * Find the cgroup from a file pointer associated with a cgroup directory. + * Returns a pointer to the cgroup on success. ERR_PTR is returned if the + * cgroup cannot be found. 
+ */ +static struct cgroup *cgroup_v1v2_get_from_file(struct file *f) { struct cgroup_subsys_state *css; - struct cgroup *cgrp; css = css_tryget_online_from_dir(f->f_path.dentry, NULL); if (IS_ERR(css)) return ERR_CAST(css); - cgrp = css->cgroup; + return css->cgroup; +} + +/** + * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports + * cgroup2. + * @f: file corresponding to cgroup2_dir + */ +static struct cgroup *cgroup_get_from_file(struct file *f) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_file(f); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; } @@ -6633,8 +6812,10 @@ struct cgroup *cgroup_get_from_path(const char *path) { struct kernfs_node *kn; struct cgroup *cgrp = ERR_PTR(-ENOENT); + struct cgroup *root_cgrp; - kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + root_cgrp = current_cgns_cgroup_dfl(); + kn = kernfs_walk_and_get(root_cgrp->kn, path); if (!kn) goto out; @@ -6659,15 +6840,15 @@ out: EXPORT_SYMBOL_GPL(cgroup_get_from_path); /** - * cgroup_get_from_fd - get a cgroup pointer from a fd - * @fd: fd obtained by open(cgroup2_dir) + * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd + * @fd: fd obtained by open(cgroup_dir) * * Find the cgroup from a fd which should be obtained * by opening a cgroup directory. Returns a pointer to the * cgroup on success. ERR_PTR is returned if the cgroup * cannot be found. */ -struct cgroup *cgroup_get_from_fd(int fd) +struct cgroup *cgroup_v1v2_get_from_fd(int fd) { struct cgroup *cgrp; struct file *f; @@ -6676,10 +6857,29 @@ struct cgroup *cgroup_get_from_fd(int fd) if (!f) return ERR_PTR(-EBADF); - cgrp = cgroup_get_from_file(f); + cgrp = cgroup_v1v2_get_from_file(f); fput(f); return cgrp; } + +/** + * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports + * cgroup2. 
+ * @fd: fd obtained by open(cgroup2_dir) + */ +struct cgroup *cgroup_get_from_fd(int fd) +{ + struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd); + + if (IS_ERR(cgrp)) + return ERR_CAST(cgrp); + + if (!cgroup_on_dfl(cgrp)) { + cgroup_put(cgrp); + return ERR_PTR(-EBADF); + } + return cgrp; +} EXPORT_SYMBOL_GPL(cgroup_get_from_fd); static u64 power_of_ten(int power) @@ -6792,9 +6992,6 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf, if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; - if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) - continue; - if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); @@ -6814,8 +7011,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr, int ssid; ssize_t ret = 0; - ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret, - NULL); + ret = show_delegatable_files(cgroup_base_files, buf + ret, + PAGE_SIZE - ret, NULL); + if (cgroup_psi_enabled()) + ret += show_delegatable_files(cgroup_psi_files, buf + ret, + PAGE_SIZE - ret, NULL); for_each_subsys(ss, ssid) ret += show_delegatable_files(ss->dfl_cftypes, buf + ret, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1f3a55297f39..b474289c15b8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -33,6 +33,7 @@ #include <linux/interrupt.h> #include <linux/kernel.h> #include <linux/kmod.h> +#include <linux/kthread.h> #include <linux/list.h> #include <linux/mempolicy.h> #include <linux/mm.h> @@ -85,6 +86,30 @@ struct fmeter { spinlock_t lock; /* guards read or write of above */ }; +/* + * Invalid partition error code + */ +enum prs_errcode { + PERR_NONE = 0, + PERR_INVCPUS, + PERR_INVPARENT, + PERR_NOTPART, + PERR_NOTEXCL, + PERR_NOCPUS, + PERR_HOTPLUG, + PERR_CPUSEMPTY, +}; + +static const char * const perr_strings[] = { + [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus", + [PERR_INVPARENT] = "Parent is an invalid partition root", + [PERR_NOTPART] = "Parent is not a partition root", + [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive", + [PERR_NOCPUS] = "Parent unable to distribute cpu downstream", + [PERR_HOTPLUG] = "No cpu available due to hotplug", + [PERR_CPUSEMPTY] = "cpuset.cpus is empty", +}; + struct cpuset { struct cgroup_subsys_state css; @@ -168,6 +193,9 @@ struct cpuset { int use_parent_ecpus; int child_ecpus_count; + /* Invalid partition error code, not lock protected */ + enum prs_errcode prs_err; + /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; }; @@ -175,20 +203,22 @@ struct cpuset { /* * Partition root states: * - * 0 - not a partition root - * + * 0 - member (not a partition root) * 1 - partition root - * + * 2 - partition root without load balancing (isolated) * -1 - invalid partition root - * None of the cpus in cpus_allowed can be put into the parent's - * subparts_cpus. In this case, the cpuset is not a real partition - * root anymore. However, the CPU_EXCLUSIVE bit will still be set - * and the cpuset can be restored back to a partition root if the - * parent cpuset can give more CPUs back to this child cpuset. 
+ * -2 - invalid isolated partition root */ -#define PRS_DISABLED 0 -#define PRS_ENABLED 1 -#define PRS_ERROR -1 +#define PRS_MEMBER 0 +#define PRS_ROOT 1 +#define PRS_ISOLATED 2 +#define PRS_INVALID_ROOT -1 +#define PRS_INVALID_ISOLATED -2 + +static inline bool is_prs_invalid(int prs_state) +{ + return prs_state < 0; +} /* * Temporary cpumasks for working with partitions that are passed among @@ -268,25 +298,43 @@ static inline int is_spread_slab(const struct cpuset *cs) return test_bit(CS_SPREAD_SLAB, &cs->flags); } -static inline int is_partition_root(const struct cpuset *cs) +static inline int is_partition_valid(const struct cpuset *cs) { return cs->partition_root_state > 0; } +static inline int is_partition_invalid(const struct cpuset *cs) +{ + return cs->partition_root_state < 0; +} + +/* + * Callers should hold callback_lock to modify partition_root_state. + */ +static inline void make_partition_invalid(struct cpuset *cs) +{ + if (is_partition_valid(cs)) + cs->partition_root_state = -cs->partition_root_state; +} + /* * Send notification event of whenever partition_root_state changes. */ -static inline void notify_partition_change(struct cpuset *cs, - int old_prs, int new_prs) +static inline void notify_partition_change(struct cpuset *cs, int old_prs) { - if (old_prs != new_prs) - cgroup_file_notify(&cs->partition_file); + if (old_prs == cs->partition_root_state) + return; + cgroup_file_notify(&cs->partition_file); + + /* Reset prs_err if not invalid */ + if (is_partition_valid(cs)) + WRITE_ONCE(cs->prs_err, PERR_NONE); } static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), - .partition_root_state = PRS_ENABLED, + .partition_root_state = PRS_ROOT, }; /** @@ -404,6 +452,41 @@ static inline bool is_in_v2_mode(void) (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE); } +/** + * partition_is_populated - check if partition has tasks + * @cs: partition root to be checked + * @excluded_child: a child cpuset to be excluded in task checking + * Return: true if there are tasks, false otherwise + * + * It is assumed that @cs is a valid partition root. @excluded_child should + * be non-NULL when this cpuset is going to become a partition itself. + */ +static inline bool partition_is_populated(struct cpuset *cs, + struct cpuset *excluded_child) +{ + struct cgroup_subsys_state *css; + struct cpuset *child; + + if (cs->css.cgroup->nr_populated_csets) + return true; + if (!excluded_child && !cs->nr_subparts_cpus) + return cgroup_is_populated(cs->css.cgroup); + + rcu_read_lock(); + cpuset_for_each_child(child, css, cs) { + if (child == excluded_child) + continue; + if (is_partition_valid(child)) + continue; + if (cgroup_is_populated(child->css.cgroup)) { + rcu_read_unlock(); + return true; + } + } + rcu_read_unlock(); + return false; +} + /* * Return in pmask the portion of a task's cpusets's cpus_allowed that * are online and are capable of running the task. 
If none are found, @@ -659,22 +742,6 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) par = parent_cs(cur); /* - * If either I or some sibling (!= me) is exclusive, we can't - * overlap - */ - ret = -EINVAL; - cpuset_for_each_child(c, css, par) { - if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && - c != cur && - cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) - goto out; - if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && - c != cur && - nodes_intersects(trial->mems_allowed, c->mems_allowed)) - goto out; - } - - /* * Cpusets with tasks - existing or newly being attached - can't * be changed to have empty cpus_allowed or mems_allowed. */ @@ -698,6 +765,22 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) trial->cpus_allowed)) goto out; + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap + */ + ret = -EINVAL; + cpuset_for_each_child(c, css, par) { + if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && + c != cur && + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + goto out; + if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && + c != cur && + nodes_intersects(trial->mems_allowed, c->mems_allowed)) + goto out; + } + ret = 0; out: rcu_read_unlock(); @@ -875,7 +958,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa[csn++] = cp; /* skip @cp's subtree if not a partition root */ - if (!is_partition_root(cp)) + if (!is_partition_valid(cp)) pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -1081,7 +1164,7 @@ static void rebuild_sched_domains_locked(void) if (top_cpuset.nr_subparts_cpus) { rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!is_partition_root(cs)) { + if (!is_partition_valid(cs)) { pos_css = css_rightmost_descendant(pos_css); continue; } @@ -1127,10 +1210,18 @@ static void update_tasks_cpumask(struct cpuset *cs) { struct css_task_iter it; struct task_struct *task; + bool top_cs = cs == &top_cpuset; css_task_iter_start(&cs->css, 0, &it); - while ((task = css_task_iter_next(&it))) + while ((task = css_task_iter_next(&it))) { + /* + * Percpu kthreads in top_cpuset are ignored + */ + if (top_cs && (task->flags & PF_KTHREAD) && + kthread_is_per_cpu(task)) + continue; set_cpus_allowed_ptr(task, cs->effective_cpus); + } css_task_iter_end(&it); } @@ -1165,15 +1256,18 @@ enum subparts_cmd { partcmd_enable, /* Enable partition root */ partcmd_disable, /* Disable partition root */ partcmd_update, /* Update parent's subparts_cpus */ + partcmd_invalidate, /* Make partition invalid */ }; +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on); /** * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset * @cpuset: The cpuset that requests change in partition root state * @cmd: Partition root state change command * @newmask: Optional new cpumask for partcmd_update * @tmp: Temporary addmask and delmask - * Return: 0, 1 or an error code + * Return: 0 or a partition root state error code * * For partcmd_enable, the cpuset is being transformed from a non-partition * root to a partition root. The cpus_allowed mask of the given cpuset will @@ -1184,38 +1278,36 @@ enum subparts_cmd { * For partcmd_disable, the cpuset is being transformed from a partition * root back to a non-partition root. Any CPUs in cpus_allowed that are in * parent's subparts_cpus will be taken away from that cpumask and put back - * into parent's effective_cpus. 0 should always be returned. 
- * - * For partcmd_update, if the optional newmask is specified, the cpu - * list is to be changed from cpus_allowed to newmask. Otherwise, - * cpus_allowed is assumed to remain the same. The cpuset should either - * be a partition root or an invalid partition root. The partition root - * state may change if newmask is NULL and none of the requested CPUs can - * be granted by the parent. The function will return 1 if changes to - * parent's subparts_cpus and effective_cpus happen or 0 otherwise. - * Error code should only be returned when newmask is non-NULL. + * into parent's effective_cpus. 0 will always be returned. * - * The partcmd_enable and partcmd_disable commands are used by - * update_prstate(). The partcmd_update command is used by - * update_cpumasks_hier() with newmask NULL and update_cpumask() with - * newmask set. + * For partcmd_update, if the optional newmask is specified, the cpu list is + * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is + * assumed to remain the same. The cpuset should either be a valid or invalid + * partition root. The partition root state may change from valid to invalid + * or vice versa. An error code will only be returned if transitioning from + * invalid to valid violates the exclusivity rule. * - * The checking is more strict when enabling partition root than the - * other two commands. + * For partcmd_invalidate, the current partition will be made invalid. * - * Because of the implicit cpu exclusive nature of a partition root, - * cpumask changes that violates the cpu exclusivity rule will not be - * permitted when checked by validate_change(). + * The partcmd_enable and partcmd_disable commands are used by + * update_prstate(). An error code may be returned and the caller will check + * for error. + * + * The partcmd_update command is used by update_cpumasks_hier() with newmask + * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used + * by update_cpumask() with NULL newmask. In both cases, the callers won't + * check for error and so partition_root_state and prs_error will be updated + * directly. */ -static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, +static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd, struct cpumask *newmask, struct tmpmasks *tmp) { - struct cpuset *parent = parent_cs(cpuset); + struct cpuset *parent = parent_cs(cs); int adding; /* Moving cpus from effective_cpus to subparts_cpus */ int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ int old_prs, new_prs; - bool part_error = false; /* Partition error? */ + int part_error = PERR_NONE; /* Partition error? */ percpu_rwsem_assert_held(&cpuset_rwsem); @@ -1224,126 +1316,165 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * The new cpumask, if present, or the current cpus_allowed must * not be empty. */ - if (!is_partition_root(parent) || - (newmask && cpumask_empty(newmask)) || - (!newmask && cpumask_empty(cpuset->cpus_allowed))) - return -EINVAL; - - /* - * Enabling/disabling partition root is not allowed if there are - * online children. - */ - if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css)) - return -EBUSY; - - /* - * Enabling partition root is not allowed if not all the CPUs - * can be granted from parent's effective_cpus or at least one - * CPU will be left after that. 
- */ - if ((cmd == partcmd_enable) && - (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) || - cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus))) - return -EINVAL; + if (!is_partition_valid(parent)) { + return is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART; + } + if ((newmask && cpumask_empty(newmask)) || + (!newmask && cpumask_empty(cs->cpus_allowed))) + return PERR_CPUSEMPTY; /* - * A cpumask update cannot make parent's effective_cpus become empty. + * new_prs will only be changed for the partcmd_update and + * partcmd_invalidate commands. */ adding = deleting = false; - old_prs = new_prs = cpuset->partition_root_state; + old_prs = new_prs = cs->partition_root_state; if (cmd == partcmd_enable) { - cpumask_copy(tmp->addmask, cpuset->cpus_allowed); + /* + * Enabling partition root is not allowed if cpus_allowed + * doesn't overlap parent's cpus_allowed. + */ + if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed)) + return PERR_INVCPUS; + + /* + * A parent can be left with no CPU as long as there is no + * task directly associated with the parent partition. + */ + if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) && + partition_is_populated(parent, cs)) + return PERR_NOCPUS; + + cpumask_copy(tmp->addmask, cs->cpus_allowed); adding = true; } else if (cmd == partcmd_disable) { - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, + /* + * Need to remove cpus from parent's subparts_cpus for valid + * partition root. + */ + deleting = !is_prs_invalid(old_prs) && + cpumask_and(tmp->delmask, cs->cpus_allowed, + parent->subparts_cpus); + } else if (cmd == partcmd_invalidate) { + if (is_prs_invalid(old_prs)) + return 0; + + /* + * Make the current partition invalid. It is assumed that + * invalidation is caused by violating cpu exclusivity rule. + */ + deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, parent->subparts_cpus); + if (old_prs > 0) { + new_prs = -old_prs; + part_error = PERR_NOTEXCL; + } } else if (newmask) { /* * partcmd_update with newmask: * + * Compute add/delete mask to/from subparts_cpus + * * delmask = cpus_allowed & ~newmask & parent->subparts_cpus - * addmask = newmask & parent->effective_cpus + * addmask = newmask & parent->cpus_allowed * & ~parent->subparts_cpus */ - cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask); + cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask); deleting = cpumask_and(tmp->delmask, tmp->delmask, parent->subparts_cpus); - cpumask_and(tmp->addmask, newmask, parent->effective_cpus); + cpumask_and(tmp->addmask, newmask, parent->cpus_allowed); adding = cpumask_andnot(tmp->addmask, tmp->addmask, parent->subparts_cpus); /* - * Return error if the new effective_cpus could become empty. + * Make partition invalid if parent's effective_cpus could + * become empty and there are tasks in the parent. */ if (adding && - cpumask_equal(parent->effective_cpus, tmp->addmask)) { - if (!deleting) - return -EINVAL; - /* - * As some of the CPUs in subparts_cpus might have - * been offlined, we need to compute the real delmask - * to confirm that. 
- */ - if (!cpumask_and(tmp->addmask, tmp->delmask, - cpu_active_mask)) - return -EINVAL; - cpumask_copy(tmp->addmask, parent->effective_cpus); + cpumask_subset(parent->effective_cpus, tmp->addmask) && + !cpumask_intersects(tmp->delmask, cpu_active_mask) && + partition_is_populated(parent, cs)) { + part_error = PERR_NOCPUS; + adding = false; + deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, + parent->subparts_cpus); } } else { /* * partcmd_update w/o newmask: * - * addmask = cpus_allowed & parent->effective_cpus + * delmask = cpus_allowed & parent->subparts_cpus + * addmask = cpus_allowed & parent->cpus_allowed + * & ~parent->subparts_cpus * - * Note that parent's subparts_cpus may have been - * pre-shrunk in case there is a change in the cpu list. - * So no deletion is needed. + * This gets invoked either due to a hotplug event or from + * update_cpumasks_hier(). This can cause the state of a + * partition root to transition from valid to invalid or vice + * versa. So we still need to compute the addmask and delmask. + + * A partition error happens when: + * 1) Cpuset is valid partition, but parent does not distribute + * out any CPUs. + * 2) Parent has tasks and all its effective CPUs will have + * to be distributed out. */ - adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed, - parent->effective_cpus); - part_error = cpumask_equal(tmp->addmask, - parent->effective_cpus); + cpumask_and(tmp->addmask, cs->cpus_allowed, + parent->cpus_allowed); + adding = cpumask_andnot(tmp->addmask, tmp->addmask, + parent->subparts_cpus); + + if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) || + (adding && + cpumask_subset(parent->effective_cpus, tmp->addmask) && + partition_is_populated(parent, cs))) { + part_error = PERR_NOCPUS; + adding = false; + } + + if (part_error && is_partition_valid(cs) && + parent->nr_subparts_cpus) + deleting = cpumask_and(tmp->delmask, cs->cpus_allowed, + parent->subparts_cpus); } + if (part_error) + WRITE_ONCE(cs->prs_err, part_error); if (cmd == partcmd_update) { - int prev_prs = cpuset->partition_root_state; - /* - * Check for possible transition between PRS_ENABLED - * and PRS_ERROR. + * Check for possible transition between valid and invalid + * partition root. */ - switch (cpuset->partition_root_state) { - case PRS_ENABLED: + switch (cs->partition_root_state) { + case PRS_ROOT: + case PRS_ISOLATED: if (part_error) - new_prs = PRS_ERROR; + new_prs = -old_prs; break; - case PRS_ERROR: + case PRS_INVALID_ROOT: + case PRS_INVALID_ISOLATED: if (!part_error) - new_prs = PRS_ENABLED; + new_prs = -old_prs; break; } - /* - * Set part_error if previously in invalid state. - */ - part_error = (prev_prs == PRS_ERROR); - } - - if (!part_error && (new_prs == PRS_ERROR)) - return 0; /* Nothing need to be done */ - - if (new_prs == PRS_ERROR) { - /* - * Remove all its cpus from parent's subparts_cpus. - */ - adding = false; - deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed, - parent->subparts_cpus); } if (!adding && !deleting && (new_prs == old_prs)) return 0; /* + * Transitioning between invalid to valid or vice versa may require + * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE. + */ + if (old_prs != new_prs) { + if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) && + (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0)) + return PERR_NOTEXCL; + if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs)) + update_flag(CS_CPU_EXCLUSIVE, cs, 0); + } + + /* * Change the parent's subparts_cpus. 
* Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. @@ -1369,18 +1500,32 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus); if (old_prs != new_prs) - cpuset->partition_root_state = new_prs; + cs->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); - notify_partition_change(cpuset, old_prs, new_prs); - return cmd == partcmd_update; + if (adding || deleting) + update_tasks_cpumask(parent); + + /* + * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. + * rebuild_sched_domains_locked() may be called. + */ + if (old_prs != new_prs) { + if (old_prs == PRS_ISOLATED) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); + else if (new_prs == PRS_ISOLATED) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + } + notify_partition_change(cs, old_prs); + return 0; } /* * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree * @cs: the cpuset to consider * @tmp: temp variables for calculating effective_cpus & partition setup + * @force: don't skip any descendant cpusets if set * * When configured cpumask is changed, the effective cpumasks of this cpuset * and all its descendants need to be updated. @@ -1389,7 +1534,8 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * * Called with cpuset_rwsem held */ -static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) +static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, + bool force) { struct cpuset *cp; struct cgroup_subsys_state *pos_css; @@ -1399,14 +1545,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool update_parent = false; compute_effective_cpumask(tmp->new_cpus, cp, parent); /* * If it becomes empty, inherit the effective mask of the - * parent, which is guaranteed to have some CPUs. + * parent, which is guaranteed to have some CPUs unless + * it is a partition root that has explicitly distributed + * out all its CPUs. */ if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { + if (is_partition_valid(cp) && + cpumask_equal(cp->cpus_allowed, cp->subparts_cpus)) + goto update_parent_subparts; + cpumask_copy(tmp->new_cpus, parent->effective_cpus); if (!cp->use_parent_ecpus) { cp->use_parent_ecpus = true; @@ -1420,14 +1573,15 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) /* * Skip the whole subtree if the cpumask remains the same - * and has no partition root state. + * and has no partition root state and force flag not set. */ - if (!cp->partition_root_state && + if (!cp->partition_root_state && !force && cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { pos_css = css_rightmost_descendant(pos_css); continue; } +update_parent_subparts: /* * update_parent_subparts_cpumask() should have been called * for cs already in update_cpumask(). We should also call @@ -1437,36 +1591,22 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) old_prs = new_prs = cp->partition_root_state; if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { - case PRS_DISABLED: - /* - * If parent is not a partition root or an - * invalid partition root, clear its state - * and its CS_CPU_EXCLUSIVE flag. 
- */ - WARN_ON_ONCE(cp->partition_root_state - != PRS_ERROR); - new_prs = PRS_DISABLED; - - /* - * clear_bit() is an atomic operation and - * readers aren't interested in the state - * of CS_CPU_EXCLUSIVE anyway. So we can - * just update the flag without holding - * the callback_lock. - */ - clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); + case PRS_ROOT: + case PRS_ISOLATED: + update_parent = true; break; - case PRS_ENABLED: - if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp)) - update_tasks_cpumask(parent); - break; - - case PRS_ERROR: + default: /* - * When parent is invalid, it has to be too. + * When parent is not a partition root or is + * invalid, child partition roots become + * invalid too. */ - new_prs = PRS_ERROR; + if (is_partition_valid(cp)) + new_prs = -cp->partition_root_state; + WRITE_ONCE(cp->prs_err, + is_partition_invalid(parent) + ? PERR_INVPARENT : PERR_NOTPART); break; } } @@ -1475,42 +1615,44 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) continue; rcu_read_unlock(); + if (update_parent) { + update_parent_subparts_cpumask(cp, partcmd_update, NULL, + tmp); + /* + * The cpuset partition_root_state may become + * invalid. Capture it. + */ + new_prs = cp->partition_root_state; + } + spin_lock_irq(&callback_lock); - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { + if (cp->nr_subparts_cpus && !is_partition_valid(cp)) { + /* + * Put all active subparts_cpus back to effective_cpus. + */ + cpumask_or(tmp->new_cpus, tmp->new_cpus, + cp->subparts_cpus); + cpumask_and(tmp->new_cpus, tmp->new_cpus, + cpu_active_mask); cp->nr_subparts_cpus = 0; cpumask_clear(cp->subparts_cpus); - } else if (cp->nr_subparts_cpus) { + } + + cpumask_copy(cp->effective_cpus, tmp->new_cpus); + if (cp->nr_subparts_cpus) { /* * Make sure that effective_cpus & subparts_cpus * are mutually exclusive. - * - * In the unlikely event that effective_cpus - * becomes empty. we clear cp->nr_subparts_cpus and - * let its child partition roots to compete for - * CPUs again. 
*/ cpumask_andnot(cp->effective_cpus, cp->effective_cpus, cp->subparts_cpus); - if (cpumask_empty(cp->effective_cpus)) { - cpumask_copy(cp->effective_cpus, tmp->new_cpus); - cpumask_clear(cp->subparts_cpus); - cp->nr_subparts_cpus = 0; - } else if (!cpumask_subset(cp->subparts_cpus, - tmp->new_cpus)) { - cpumask_andnot(cp->subparts_cpus, - cp->subparts_cpus, tmp->new_cpus); - cp->nr_subparts_cpus - = cpumask_weight(cp->subparts_cpus); - } } - if (new_prs != old_prs) - cp->partition_root_state = new_prs; - + cp->partition_root_state = new_prs; spin_unlock_irq(&callback_lock); - notify_partition_change(cp, old_prs, new_prs); + + notify_partition_change(cp, old_prs); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -1526,7 +1668,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) if (!cpumask_empty(cp->cpus_allowed) && is_sched_load_balance(cp) && (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || - is_partition_root(cp))) + is_partition_valid(cp))) need_rebuild_sched_domains = true; rcu_read_lock(); @@ -1570,7 +1712,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, continue; rcu_read_unlock(); - update_cpumasks_hier(sibling, tmp); + update_cpumasks_hier(sibling, tmp, false); rcu_read_lock(); css_put(&sibling->css); } @@ -1588,6 +1730,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, { int retval; struct tmpmasks tmp; + bool invalidate = false; /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ if (cs == &top_cpuset) @@ -1615,10 +1758,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) return 0; - retval = validate_change(cs, trialcs); - if (retval < 0) - return retval; - #ifdef CONFIG_CPUMASK_OFFSTACK /* * Use the cpumasks in trialcs for tmpmasks when they are pointers @@ -1629,28 +1768,70 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, tmp.new_cpus = trialcs->cpus_allowed; #endif + retval = validate_change(cs, trialcs); + + if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) { + struct cpuset *cp, *parent; + struct cgroup_subsys_state *css; + + /* + * The -EINVAL error code indicates that partition sibling + * CPU exclusivity rule has been violated. We still allow + * the cpumask change to proceed while invalidating the + * partition. However, any conflicting sibling partitions + * have to be marked as invalid too. 
+ */ + invalidate = true; + rcu_read_lock(); + parent = parent_cs(cs); + cpuset_for_each_child(cp, css, parent) + if (is_partition_valid(cp) && + cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) { + rcu_read_unlock(); + update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp); + rcu_read_lock(); + } + rcu_read_unlock(); + retval = 0; + } + if (retval < 0) + return retval; + if (cs->partition_root_state) { - /* Cpumask of a partition root cannot be empty */ - if (cpumask_empty(trialcs->cpus_allowed)) - return -EINVAL; - if (update_parent_subparts_cpumask(cs, partcmd_update, - trialcs->cpus_allowed, &tmp) < 0) - return -EINVAL; + if (invalidate) + update_parent_subparts_cpumask(cs, partcmd_invalidate, + NULL, &tmp); + else + update_parent_subparts_cpumask(cs, partcmd_update, + trialcs->cpus_allowed, &tmp); } + compute_effective_cpumask(trialcs->effective_cpus, trialcs, + parent_cs(cs)); spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); /* - * Make sure that subparts_cpus is a subset of cpus_allowed. + * Make sure that subparts_cpus, if not empty, is a subset of + * cpus_allowed. Clear subparts_cpus if partition not valid or + * empty effective cpus with tasks. */ if (cs->nr_subparts_cpus) { - cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed); - cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + if (!is_partition_valid(cs) || + (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) && + partition_is_populated(cs, NULL))) { + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + } else { + cpumask_and(cs->subparts_cpus, cs->subparts_cpus, + cs->cpus_allowed); + cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); + } } spin_unlock_irq(&callback_lock); - update_cpumasks_hier(cs, &tmp); + /* effective_cpus will be updated here */ + update_cpumasks_hier(cs, &tmp, false); if (cs->partition_root_state) { struct cpuset *parent = parent_cs(cs); @@ -2026,16 +2207,18 @@ out: return err; } -/* +/** * update_prstate - update partition_root_state - * cs: the cpuset to update - * new_prs: new partition root state + * @cs: the cpuset to update + * @new_prs: new partition root state + * Return: 0 if successful, != 0 if error * * Call with cpuset_rwsem held. */ static int update_prstate(struct cpuset *cs, int new_prs) { - int err, old_prs = cs->partition_root_state; + int err = PERR_NONE, old_prs = cs->partition_root_state; + bool sched_domain_rebuilt = false; struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; @@ -2043,28 +2226,33 @@ static int update_prstate(struct cpuset *cs, int new_prs) return 0; /* - * Cannot force a partial or invalid partition root to a full - * partition root. + * For a previously invalid partition root, leave it at being + * invalid if new_prs is not "member". */ - if (new_prs && (old_prs == PRS_ERROR)) - return -EINVAL; + if (new_prs && is_prs_invalid(old_prs)) { + cs->partition_root_state = -new_prs; + return 0; + } if (alloc_cpumasks(NULL, &tmpmask)) return -ENOMEM; - err = -EINVAL; if (!old_prs) { /* * Turning on partition root requires setting the * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed - * cannot be NULL. + * cannot be empty. 
*/ - if (cpumask_empty(cs->cpus_allowed)) + if (cpumask_empty(cs->cpus_allowed)) { + err = PERR_CPUSEMPTY; goto out; + } err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); - if (err) + if (err) { + err = PERR_NOTEXCL; goto out; + } err = update_parent_subparts_cpumask(cs, partcmd_enable, NULL, &tmpmask); @@ -2072,47 +2260,77 @@ static int update_prstate(struct cpuset *cs, int new_prs) update_flag(CS_CPU_EXCLUSIVE, cs, 0); goto out; } + + if (new_prs == PRS_ISOLATED) { + /* + * Disable the load balance flag should not return an + * error unless the system is running out of memory. + */ + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + sched_domain_rebuilt = true; + } + } else if (old_prs && new_prs) { + /* + * A change in load balance state only, no change in cpumasks. + */ + update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED)); + sched_domain_rebuilt = true; + goto out; /* Sched domain is rebuilt in update_flag() */ } else { /* - * Turning off partition root will clear the - * CS_CPU_EXCLUSIVE bit. + * Switching back to member is always allowed even if it + * disables child partitions. */ - if (old_prs == PRS_ERROR) { - update_flag(CS_CPU_EXCLUSIVE, cs, 0); - err = 0; - goto out; - } + update_parent_subparts_cpumask(cs, partcmd_disable, NULL, + &tmpmask); - err = update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, &tmpmask); - if (err) - goto out; + /* + * If there are child partitions, they will all become invalid. + */ + if (unlikely(cs->nr_subparts_cpus)) { + spin_lock_irq(&callback_lock); + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + compute_effective_cpumask(cs->effective_cpus, cs, parent); + spin_unlock_irq(&callback_lock); + } /* Turning off CS_CPU_EXCLUSIVE will not return error */ update_flag(CS_CPU_EXCLUSIVE, cs, 0); + + if (!is_sched_load_balance(cs)) { + /* Make sure load balance is on */ + update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); + sched_domain_rebuilt = true; + } } - /* - * Update cpumask of parent's tasks except when it is the top - * cpuset as some system daemons cannot be mapped to other CPUs. - */ - if (parent != &top_cpuset) - update_tasks_cpumask(parent); + update_tasks_cpumask(parent); if (parent->child_ecpus_count) update_sibling_cpumasks(parent, cs, &tmpmask); - rebuild_sched_domains_locked(); + if (!sched_domain_rebuilt) + rebuild_sched_domains_locked(); out: - if (!err) { - spin_lock_irq(&callback_lock); - cs->partition_root_state = new_prs; - spin_unlock_irq(&callback_lock); - notify_partition_change(cs, old_prs, new_prs); - } + /* + * Make partition invalid if an error happen + */ + if (err) + new_prs = -new_prs; + spin_lock_irq(&callback_lock); + cs->partition_root_state = new_prs; + spin_unlock_irq(&callback_lock); + /* + * Update child cpusets, if present. + * Force update if switching back to member. + */ + if (!list_empty(&cs->css.children)) + update_cpumasks_hier(cs, &tmpmask, !new_prs); + notify_partition_change(cs, old_prs); free_cpumasks(NULL, &tmpmask); - return err; + return 0; } /* @@ -2238,6 +2456,12 @@ static int cpuset_can_attach(struct cgroup_taskset *tset) (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; + /* + * Task cannot be moved to a cpuset with empty effective cpus. 
+ */ + if (cpumask_empty(cs->effective_cpus)) + goto out_unlock; + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->effective_cpus); if (ret) @@ -2598,16 +2822,29 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) static int sched_partition_show(struct seq_file *seq, void *v) { struct cpuset *cs = css_cs(seq_css(seq)); + const char *err, *type = NULL; switch (cs->partition_root_state) { - case PRS_ENABLED: + case PRS_ROOT: seq_puts(seq, "root\n"); break; - case PRS_DISABLED: + case PRS_ISOLATED: + seq_puts(seq, "isolated\n"); + break; + case PRS_MEMBER: seq_puts(seq, "member\n"); break; - case PRS_ERROR: - seq_puts(seq, "root invalid\n"); + case PRS_INVALID_ROOT: + type = "root"; + fallthrough; + case PRS_INVALID_ISOLATED: + if (!type) + type = "isolated"; + err = perr_strings[READ_ONCE(cs->prs_err)]; + if (err) + seq_printf(seq, "%s invalid (%s)\n", type, err); + else + seq_printf(seq, "%s invalid\n", type); break; } return 0; @@ -2626,9 +2863,11 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, * Convert "root" to ENABLED, and convert "member" to DISABLED. */ if (!strcmp(buf, "root")) - val = PRS_ENABLED; + val = PRS_ROOT; else if (!strcmp(buf, "member")) - val = PRS_DISABLED; + val = PRS_MEMBER; + else if (!strcmp(buf, "isolated")) + val = PRS_ISOLATED; else return -EINVAL; @@ -2927,7 +3166,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); percpu_down_write(&cpuset_rwsem); - if (is_partition_root(cs)) + if (is_partition_valid(cs)) update_prstate(cs, 0); if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && @@ -3103,7 +3342,8 @@ hotplug_update_tasks(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, bool cpus_updated, bool mems_updated) { - if (cpumask_empty(new_cpus)) + /* A partition root is allowed to have empty effective cpus */ + if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; @@ -3172,11 +3412,31 @@ retry: /* * In the unlikely event that a partition root has empty - * effective_cpus or its parent becomes erroneous, we have to - * transition it to the erroneous state. + * effective_cpus with tasks, we will have to invalidate child + * partitions, if present, by setting nr_subparts_cpus to 0 to + * reclaim their cpus. */ - if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || - (parent->partition_root_state == PRS_ERROR))) { + if (cs->nr_subparts_cpus && is_partition_valid(cs) && + cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { + spin_lock_irq(&callback_lock); + cs->nr_subparts_cpus = 0; + cpumask_clear(cs->subparts_cpus); + spin_unlock_irq(&callback_lock); + compute_effective_cpumask(&new_cpus, cs, parent); + } + + /* + * Force the partition to become invalid if either one of + * the following conditions hold: + * 1) empty effective cpus but not valid empty partition. + * 2) parent is invalid or doesn't grant any cpus to child + * partitions. 
+ */ + if (is_partition_valid(cs) && (!parent->nr_subparts_cpus || + (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) { + int old_prs, parent_prs; + + update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); if (cs->nr_subparts_cpus) { spin_lock_irq(&callback_lock); cs->nr_subparts_cpus = 0; @@ -3185,39 +3445,32 @@ retry: compute_effective_cpumask(&new_cpus, cs, parent); } - /* - * If the effective_cpus is empty because the child - * partitions take away all the CPUs, we can keep - * the current partition and let the child partitions - * fight for available CPUs. - */ - if ((parent->partition_root_state == PRS_ERROR) || - cpumask_empty(&new_cpus)) { - int old_prs; - - update_parent_subparts_cpumask(cs, partcmd_disable, - NULL, tmp); - old_prs = cs->partition_root_state; - if (old_prs != PRS_ERROR) { - spin_lock_irq(&callback_lock); - cs->partition_root_state = PRS_ERROR; - spin_unlock_irq(&callback_lock); - notify_partition_change(cs, old_prs, PRS_ERROR); - } + old_prs = cs->partition_root_state; + parent_prs = parent->partition_root_state; + if (is_partition_valid(cs)) { + spin_lock_irq(&callback_lock); + make_partition_invalid(cs); + spin_unlock_irq(&callback_lock); + if (is_prs_invalid(parent_prs)) + WRITE_ONCE(cs->prs_err, PERR_INVPARENT); + else if (!parent_prs) + WRITE_ONCE(cs->prs_err, PERR_NOTPART); + else + WRITE_ONCE(cs->prs_err, PERR_HOTPLUG); + notify_partition_change(cs, old_prs); } cpuset_force_rebuild(); } /* - * On the other hand, an erroneous partition root may be transitioned - * back to a regular one or a partition root with no CPU allocated - * from the parent may change to erroneous. + * On the other hand, an invalid partition root may be transitioned + * back to a regular one. */ - if (is_partition_root(parent) && - ((cs->partition_root_state == PRS_ERROR) || - !cpumask_intersects(&new_cpus, parent->subparts_cpus)) && - update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp)) - cpuset_force_rebuild(); + else if (is_partition_valid(parent) && is_partition_invalid(cs)) { + update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp); + if (is_partition_valid(cs)) + cpuset_force_rebuild(); + } update_tasks: cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus); diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 08236798d173..1b6b21851e9d 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) if (parent && (parent->state & CGROUP_FREEZING)) { freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); } mutex_unlock(&freezer_mutex); @@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css) mutex_lock(&freezer_mutex); if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); freezer->state = 0; @@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset) __thaw_task(task); } else { freeze_task(task); + /* clear FROZEN and propagate upwards */ while (freezer && (freezer->state & CGROUP_FROZEN)) { freezer->state &= ~CGROUP_FROZEN; @@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) css_task_iter_start(css, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. 
Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } + if (freezing(task) && !frozen(task)) + goto out_iter_end; } freezer->state |= CGROUP_FROZEN; @@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); freezer->state |= state; freeze_cgroup(freezer); } else { @@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, freezer->state &= ~state; if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); freezer->state &= ~CGROUP_FROZEN; + if (was_freezing) + static_branch_dec(&freezer_active); unfreeze_cgroup(freezer); } } diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 511af87f685e..7695e60bcb40 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -47,6 +47,7 @@ struct pids_cgroup { */ atomic64_t counter; atomic64_t limit; + int64_t watermark; /* Handle for "pids.events" */ struct cgroup_file events_file; @@ -85,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css) kfree(css_pids(css)); } +static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids) +{ + /* + * This is racy, but we don't need perfectly accurate tallying of + * the watermark, and this lets us avoid extra atomic overhead. + */ + if (nr_pids > READ_ONCE(p->watermark)) + WRITE_ONCE(p->watermark, nr_pids); +} + /** * pids_cancel - uncharge the local pid count * @pids: the pid cgroup state @@ -128,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; parent_pids(p); p = parent_pids(p)) - atomic64_add(num, &p->counter); + for (p = pids; parent_pids(p); p = parent_pids(p)) { + int64_t new = atomic64_add_return(num, &p->counter); + + pids_update_watermark(p, new); + } } /** @@ -156,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) */ if (new > limit) goto revert; + + /* + * Not technically accurate if we go over limit somewhere up + * the hierarchy, but that's tolerable for the watermark. 
+ */ + pids_update_watermark(p, new); } return 0; @@ -311,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css, return atomic64_read(&pids->counter); } +static s64 pids_peak_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct pids_cgroup *pids = css_pids(css); + + return READ_ONCE(pids->watermark); +} + static int pids_events_show(struct seq_file *sf, void *v) { struct pids_cgroup *pids = css_pids(seq_css(sf)); @@ -332,6 +360,11 @@ static struct cftype pids_files[] = { .flags = CFTYPE_NOT_ON_ROOT, }, { + .name = "peak", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = pids_peak_read, + }, + { .name = "events", .seq_show = pids_events_show, .file_offset = offsetof(struct pids_cgroup, events_file), diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7beceb447211..d5e9ccde3ab8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -50,7 +50,6 @@ #include <linux/pid.h> #include <linux/smp.h> #include <linux/mm.h> -#include <linux/vmacache.h> #include <linux/rcupdate.h> #include <linux/irq.h> #include <linux/security.h> @@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm) { - int i; - - for (i = 0; i < VMACACHE_SIZE; i++) { - if (!current->vmacache.vmas[i]) - continue; - flush_cache_range(current->vmacache.vmas[i], - addr, addr + BREAK_INSTR_SIZE); - } - } - /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 164ed9ef77a3..e39cb696cfbd 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -214,13 +214,22 @@ void __delayacct_freepages_end(void) &current->delays->freepages_count); } -void __delayacct_thrashing_start(void) +void __delayacct_thrashing_start(bool *in_thrashing) { + *in_thrashing = !!current->in_thrashing; + if (*in_thrashing) + return; + + current->in_thrashing = 1; current->delays->thrashing_start = local_clock(); } -void __delayacct_thrashing_end(void) +void __delayacct_thrashing_end(bool *in_thrashing) { + if (*in_thrashing) + return; + + current->in_thrashing = 0; delayacct_end(&current->delays->lock, &current->delays->thrashing_start, &current->delays->thrashing_delay, diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 27f272381cf2..33437d620644 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -10,6 +10,7 @@ #include <linux/dma-map-ops.h> #include <linux/export.h> #include <linux/gfp.h> +#include <linux/kmsan.h> #include <linux/of_device.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); + kmsan_handle_dma(page, offset, size, dir); debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; @@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, else ents = ops->map_sg(dev, sg, nents, dir, attrs); - if (ents > 0) + if (ents > 0) { + kmsan_handle_dma_sg(sg, nents, dir); debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); - else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != -EREMOTEIO)) + } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != -EREMOTEIO)) { return -EIO; + } return ents; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 
0ef6b12f961d..339a990554e7 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -346,22 +346,27 @@ retry: memblock_free(tlb, PAGE_ALIGN(bytes)); nslabs = ALIGN(nslabs >> 1, IO_TLB_SEGSIZE); - if (nslabs < IO_TLB_MIN_SLABS) - panic("%s: Failed to remap %zu bytes\n", - __func__, bytes); - goto retry; + if (nslabs >= IO_TLB_MIN_SLABS) + goto retry; + + pr_warn("%s: Failed to remap %zu bytes\n", __func__, bytes); + return; } alloc_size = PAGE_ALIGN(array_size(sizeof(*mem->slots), nslabs)); mem->slots = memblock_alloc(alloc_size, PAGE_SIZE); - if (!mem->slots) - panic("%s: Failed to allocate %zu bytes align=0x%lx\n", - __func__, alloc_size, PAGE_SIZE); + if (!mem->slots) { + pr_warn("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + return; + } mem->areas = memblock_alloc(array_size(sizeof(struct io_tlb_area), default_nareas), SMP_CACHE_BYTES); - if (!mem->areas) - panic("%s: Failed to allocate mem->areas.\n", __func__); + if (!mem->areas) { + pr_warn("%s: Failed to allocate mem->areas.\n", __func__); + return; + } swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, flags, false, default_nareas); @@ -545,9 +550,8 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size } if (PageHighMem(pfn_to_page(pfn))) { - /* The buffer does not have a mapping. Map it in and copy */ unsigned int offset = orig_addr & ~PAGE_MASK; - char *buffer; + struct page *page; unsigned int sz = 0; unsigned long flags; @@ -555,12 +559,11 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); - buffer = kmap_atomic(pfn_to_page(pfn)); + page = pfn_to_page(pfn); if (dir == DMA_TO_DEVICE) - memcpy(vaddr, buffer + offset, sz); + memcpy_from_page(vaddr, page, offset, sz); else - memcpy(buffer + offset, vaddr, sz); - kunmap_atomic(buffer); + memcpy_to_page(page, offset, vaddr, sz); local_irq_restore(flags); size -= sz; @@ -731,8 +734,11 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, int index; phys_addr_t tlb_addr; - if (!mem || !mem->nslabs) - panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + if (!mem || !mem->nslabs) { + dev_warn_ratelimited(dev, + "Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); + return (phys_addr_t)DMA_MAPPING_ERROR; + } if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 063068a9ea9b..846add8394c4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,7 @@ #include <linux/resume_user_mode.h> #include <linux/highmem.h> #include <linux/jump_label.h> +#include <linux/kmsan.h> #include <linux/livepatch.h> #include <linux/audit.h> #include <linux/tick.h> @@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) user_exit_irqoff(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); } @@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) lockdep_hardirqs_off(CALLER_ADDR0); ct_irq_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); @@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) */ lockdep_hardirqs_off(CALLER_ADDR0); 
instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) ct_nmi_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); ftrace_nmi_enter(); instrumentation_end(); diff --git a/kernel/events/Makefile b/kernel/events/Makefile index 8591c180b52b..91a62f566743 100644 --- a/kernel/events/Makefile +++ b/kernel/events/Makefile @@ -2,4 +2,5 @@ obj-y := core.o ring_buffer.o callchain.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o +obj-$(CONFIG_HW_BREAKPOINT_KUNIT_TEST) += hw_breakpoint_test.o obj-$(CONFIG_UPROBES) += uprobes.o diff --git a/kernel/events/core.c b/kernel/events/core.c index ff4bffc502c6..aefc1e08e015 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1468,6 +1468,8 @@ static void __update_context_time(struct perf_event_context *ctx, bool adv) { u64 now = perf_clock(); + lockdep_assert_held(&ctx->lock); + if (adv) ctx->time += now - ctx->timestamp; ctx->timestamp = now; @@ -2224,16 +2226,22 @@ static inline int __pmu_filter_match(struct perf_event *event) static inline int pmu_filter_match(struct perf_event *event) { struct perf_event *sibling; + unsigned long flags; + int ret = 1; if (!__pmu_filter_match(event)) return 0; + local_irq_save(flags); for_each_sibling_event(sibling, event) { - if (!__pmu_filter_match(sibling)) - return 0; + if (!__pmu_filter_match(sibling)) { + ret = 0; + break; + } } + local_irq_restore(flags); - return 1; + return ret; } static inline int @@ -6794,11 +6802,10 @@ out_put: static void __perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_event *event) + struct perf_event *event, + u64 sample_type) { - u64 sample_type = event->attr.sample_type; - - data->type = sample_type; + data->type = event->attr.sample_type; header->size += event->id_header_size; if (sample_type & PERF_SAMPLE_TID) { @@ -6827,7 +6834,7 @@ void perf_event_header__init_id(struct perf_event_header *header, struct perf_event *event) { if (event->attr.sample_id_all) - __perf_event_header__init_id(header, data, event); + __perf_event_header__init_id(header, data, event, event->attr.sample_type); } static void __perf_event__output_id_sample(struct perf_output_handle *handle, @@ -6976,11 +6983,6 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, event, enabled, running); } -static inline bool perf_sample_save_hw_index(struct perf_event *event) -{ - return event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_HW_INDEX; -} - void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, @@ -7062,14 +7064,14 @@ void perf_output_sample(struct perf_output_handle *handle, } if (sample_type & PERF_SAMPLE_BRANCH_STACK) { - if (data->br_stack) { + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { size_t size; size = data->br_stack->nr * sizeof(struct perf_branch_entry); perf_output_put(handle, data->br_stack->nr); - if (perf_sample_save_hw_index(event)) + if (branch_sample_hw_index(event)) perf_output_put(handle, data->br_stack->hw_idx); perf_output_copy(handle, data->br_stack->entries, size); } else { @@ -7312,6 +7314,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct pt_regs *regs) { u64 sample_type = event->attr.sample_type; + u64 filtered_sample_type; header->type = PERF_RECORD_SAMPLE; 
header->size = sizeof(*header) + event->header_size; @@ -7319,7 +7322,12 @@ void perf_prepare_sample(struct perf_event_header *header, header->misc = 0; header->misc |= perf_misc_flags(regs); - __perf_event_header__init_id(header, data, event); + /* + * Clear the sample flags that have already been done by the + * PMU driver. + */ + filtered_sample_type = sample_type & ~data->sample_flags; + __perf_event_header__init_id(header, data, event, filtered_sample_type); if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); @@ -7327,7 +7335,7 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; - if (!(sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + if (filtered_sample_type & PERF_SAMPLE_CALLCHAIN) data->callchain = perf_callchain(event, regs); size += data->callchain->nr; @@ -7339,7 +7347,7 @@ void perf_prepare_sample(struct perf_event_header *header, struct perf_raw_record *raw = data->raw; int size; - if (raw) { + if (raw && (data->sample_flags & PERF_SAMPLE_RAW)) { struct perf_raw_frag *frag = &raw->frag; u32 sum = 0; @@ -7355,6 +7363,7 @@ void perf_prepare_sample(struct perf_event_header *header, frag->pad = raw->size - sum; } else { size = sizeof(u64); + data->raw = NULL; } header->size += size; @@ -7362,8 +7371,8 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_BRANCH_STACK) { int size = sizeof(u64); /* nr */ - if (data->br_stack) { - if (perf_sample_save_hw_index(event)) + if (data->sample_flags & PERF_SAMPLE_BRANCH_STACK) { + if (branch_sample_hw_index(event)) size += sizeof(u64); size += data->br_stack->nr @@ -7412,6 +7421,20 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } + if (filtered_sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = 0; + + if (filtered_sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = PERF_MEM_NA; + + if (filtered_sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = 0; + + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_DATA_PAGE_SIZE)) { + if (filtered_sample_type & PERF_SAMPLE_ADDR) + data->addr = 0; + } + if (sample_type & PERF_SAMPLE_REGS_INTR) { /* regs dump ABI info */ int size = sizeof(u64); @@ -7427,7 +7450,8 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += size; } - if (sample_type & PERF_SAMPLE_PHYS_ADDR) + if (sample_type & PERF_SAMPLE_PHYS_ADDR && + filtered_sample_type & PERF_SAMPLE_PHYS_ADDR) data->phys_addr = perf_virt_to_phys(data->addr); #ifdef CONFIG_CGROUP_PERF @@ -9998,8 +10022,16 @@ static void bpf_overflow_handler(struct perf_event *event, goto out; rcu_read_lock(); prog = READ_ONCE(event->prog); - if (prog) + if (prog) { + if (prog->call_get_stack && + (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) && + !(data->sample_flags & PERF_SAMPLE_CALLCHAIN)) { + data->callchain = perf_callchain(event, regs); + data->sample_flags |= PERF_SAMPLE_CALLCHAIN; + } + ret = bpf_prog_run(prog, &ctx); + } rcu_read_unlock(); out: __this_cpu_dec(bpf_prog_active); @@ -10025,7 +10057,7 @@ static int perf_event_set_bpf_handler(struct perf_event *event, if (event->attr.precise_ip && prog->call_get_stack && - (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) || + (!(event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) || event->attr.exclude_callchain_kernel || event->attr.exclude_callchain_user)) { /* @@ -10238,8 +10270,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter 
*filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; @@ -10942,7 +10975,7 @@ static ssize_t nr_addr_filters_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters); } DEVICE_ATTR_RO(nr_addr_filters); @@ -10953,7 +10986,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->type); } static DEVICE_ATTR_RO(type); @@ -10964,7 +10997,7 @@ perf_event_mux_interval_ms_show(struct device *dev, { struct pmu *pmu = dev_get_drvdata(dev); - return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); + return scnprintf(page, PAGE_SIZE - 1, "%d\n", pmu->hrtimer_interval_ms); } static DEFINE_MUTEX(mux_interval_mutex); @@ -11718,11 +11751,9 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: - if (event->ns) - put_pid_ns(event->ns); if (event->hw.target) put_task_struct(event->hw.target); - kmem_cache_free(perf_event_cache, event); + call_rcu(&event->rcu_head, free_event_rcu); return ERR_PTR(err); } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index f32320ac02fd..c3797701339c 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -17,61 +17,276 @@ * This file contains the arch-independent routines. */ +#include <linux/hw_breakpoint.h> + +#include <linux/atomic.h> +#include <linux/bug.h> +#include <linux/cpu.h> +#include <linux/export.h> +#include <linux/init.h> #include <linux/irqflags.h> -#include <linux/kallsyms.h> -#include <linux/notifier.h> -#include <linux/kprobes.h> #include <linux/kdebug.h> #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/percpu-rwsem.h> #include <linux/percpu.h> +#include <linux/rhashtable.h> #include <linux/sched.h> -#include <linux/init.h> #include <linux/slab.h> -#include <linux/list.h> -#include <linux/cpu.h> -#include <linux/smp.h> -#include <linux/bug.h> -#include <linux/hw_breakpoint.h> /* - * Constraints data + * Datastructure to track the total uses of N slots across tasks or CPUs; + * bp_slots_histogram::count[N] is the number of assigned N+1 breakpoint slots. + */ +struct bp_slots_histogram { +#ifdef hw_breakpoint_slots + atomic_t count[hw_breakpoint_slots(0)]; +#else + atomic_t *count; +#endif +}; + +/* + * Per-CPU constraints data. */ struct bp_cpuinfo { - /* Number of pinned cpu breakpoints in a cpu */ - unsigned int cpu_pinned; - /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ - unsigned int *tsk_pinned; - /* Number of non-pinned cpu/task breakpoints in a cpu */ - unsigned int flexible; /* XXX: placeholder, see fetch_this_slot() */ + /* Number of pinned CPU breakpoints in a CPU. */ + unsigned int cpu_pinned; + /* Histogram of pinned task breakpoints in a CPU. */ + struct bp_slots_histogram tsk_pinned; }; static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]); -static int nr_slots[TYPE_MAX]; static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type) { return per_cpu_ptr(bp_cpuinfo + type, cpu); } +/* Number of pinned CPU breakpoints globally. 
*/ +static struct bp_slots_histogram cpu_pinned[TYPE_MAX]; +/* Number of pinned CPU-independent task breakpoints. */ +static struct bp_slots_histogram tsk_pinned_all[TYPE_MAX]; + /* Keep track of the breakpoints attached to tasks */ -static LIST_HEAD(bp_task_head); +static struct rhltable task_bps_ht; +static const struct rhashtable_params task_bps_ht_params = { + .head_offset = offsetof(struct hw_perf_event, bp_list), + .key_offset = offsetof(struct hw_perf_event, target), + .key_len = sizeof_field(struct hw_perf_event, target), + .automatic_shrinking = true, +}; -static int constraints_initialized; +static bool constraints_initialized __ro_after_init; -/* Gather the number of total pinned and un-pinned bp in a cpuset */ -struct bp_busy_slots { - unsigned int pinned; - unsigned int flexible; -}; +/* + * Synchronizes accesses to the per-CPU constraints; the locking rules are: + * + * 1. Atomic updates to bp_cpuinfo::tsk_pinned only require a held read-lock + * (due to bp_slots_histogram::count being atomic, no update are lost). + * + * 2. Holding a write-lock is required for computations that require a + * stable snapshot of all bp_cpuinfo::tsk_pinned. + * + * 3. In all other cases, non-atomic accesses require the appropriately held + * lock (read-lock for read-only accesses; write-lock for reads/writes). + */ +DEFINE_STATIC_PERCPU_RWSEM(bp_cpuinfo_sem); -/* Serialize accesses to the above constraints */ -static DEFINE_MUTEX(nr_bp_mutex); +/* + * Return mutex to serialize accesses to per-task lists in task_bps_ht. Since + * rhltable synchronizes concurrent insertions/deletions, independent tasks may + * insert/delete concurrently; therefore, a mutex per task is sufficient. + * + * Uses task_struct::perf_event_mutex, to avoid extending task_struct with a + * hw_breakpoint-only mutex, which may be infrequently used. The caveat here is + * that hw_breakpoint may contend with per-task perf event list management. The + * assumption is that perf usecases involving hw_breakpoints are very unlikely + * to result in unnecessary contention. + */ +static inline struct mutex *get_task_bps_mutex(struct perf_event *bp) +{ + struct task_struct *tsk = bp->hw.target; -__weak int hw_breakpoint_weight(struct perf_event *bp) + return tsk ? &tsk->perf_event_mutex : NULL; +} + +static struct mutex *bp_constraints_lock(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) { + /* + * Fully analogous to the perf_try_init_event() nesting + * argument in the comment near perf_event_ctx_lock_nested(); + * this child->perf_event_mutex cannot ever deadlock against + * the parent->perf_event_mutex usage from + * perf_event_task_{en,dis}able(). + * + * Specifically, inherited events will never occur on + * ->perf_event_list. + */ + mutex_lock_nested(tsk_mtx, SINGLE_DEPTH_NESTING); + percpu_down_read(&bp_cpuinfo_sem); + } else { + percpu_down_write(&bp_cpuinfo_sem); + } + + return tsk_mtx; +} + +static void bp_constraints_unlock(struct mutex *tsk_mtx) +{ + if (tsk_mtx) { + percpu_up_read(&bp_cpuinfo_sem); + mutex_unlock(tsk_mtx); + } else { + percpu_up_write(&bp_cpuinfo_sem); + } +} + +static bool bp_constraints_is_locked(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + return percpu_is_write_locked(&bp_cpuinfo_sem) || + (tsk_mtx ? 
mutex_is_locked(tsk_mtx) : + percpu_is_read_locked(&bp_cpuinfo_sem)); +} + +static inline void assert_bp_constraints_lock_held(struct perf_event *bp) +{ + struct mutex *tsk_mtx = get_task_bps_mutex(bp); + + if (tsk_mtx) + lockdep_assert_held(tsk_mtx); + lockdep_assert_held(&bp_cpuinfo_sem); +} + +#ifdef hw_breakpoint_slots +/* + * Number of breakpoint slots is constant, and the same for all types. + */ +static_assert(hw_breakpoint_slots(TYPE_INST) == hw_breakpoint_slots(TYPE_DATA)); +static inline int hw_breakpoint_slots_cached(int type) { return hw_breakpoint_slots(type); } +static inline int init_breakpoint_slots(void) { return 0; } +#else +/* + * Dynamic number of breakpoint slots. + */ +static int __nr_bp_slots[TYPE_MAX] __ro_after_init; + +static inline int hw_breakpoint_slots_cached(int type) +{ + return __nr_bp_slots[type]; +} + +static __init bool +bp_slots_histogram_alloc(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + hist->count = kcalloc(hw_breakpoint_slots_cached(type), sizeof(*hist->count), GFP_KERNEL); + return hist->count; +} + +static __init void bp_slots_histogram_free(struct bp_slots_histogram *hist) +{ + kfree(hist->count); +} + +static __init int init_breakpoint_slots(void) +{ + int i, cpu, err_cpu; + + for (i = 0; i < TYPE_MAX; i++) + __nr_bp_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + struct bp_cpuinfo *info = get_bp_info(cpu, i); + + if (!bp_slots_histogram_alloc(&info->tsk_pinned, i)) + goto err; + } + } + for (i = 0; i < TYPE_MAX; i++) { + if (!bp_slots_histogram_alloc(&cpu_pinned[i], i)) + goto err; + if (!bp_slots_histogram_alloc(&tsk_pinned_all[i], i)) + goto err; + } + + return 0; +err: + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + bp_slots_histogram_free(&get_bp_info(err_cpu, i)->tsk_pinned); + if (err_cpu == cpu) + break; + } + for (i = 0; i < TYPE_MAX; i++) { + bp_slots_histogram_free(&cpu_pinned[i]); + bp_slots_histogram_free(&tsk_pinned_all[i]); + } + + return -ENOMEM; +} +#endif + +static inline void +bp_slots_histogram_add(struct bp_slots_histogram *hist, int old, int val) +{ + const int old_idx = old - 1; + const int new_idx = old_idx + val; + + if (old_idx >= 0) + WARN_ON(atomic_dec_return_relaxed(&hist->count[old_idx]) < 0); + if (new_idx >= 0) + WARN_ON(atomic_inc_return_relaxed(&hist->count[new_idx]) < 0); +} + +static int +bp_slots_histogram_max(struct bp_slots_histogram *hist, enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count = atomic_read(&hist->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. */ + ASSERT_EXCLUSIVE_WRITER(hist->count[i]); + if (count > 0) + return i + 1; + WARN(count < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +static int +bp_slots_histogram_max_merge(struct bp_slots_histogram *hist1, struct bp_slots_histogram *hist2, + enum bp_type_idx type) +{ + for (int i = hw_breakpoint_slots_cached(type) - 1; i >= 0; i--) { + const int count1 = atomic_read(&hist1->count[i]); + const int count2 = atomic_read(&hist2->count[i]); + + /* Catch unexpected writers; we want a stable snapshot. 
*/ + ASSERT_EXCLUSIVE_WRITER(hist1->count[i]); + ASSERT_EXCLUSIVE_WRITER(hist2->count[i]); + if (count1 + count2 > 0) + return i + 1; + WARN(count1 < 0, "inconsistent breakpoint slots histogram"); + WARN(count2 < 0, "inconsistent breakpoint slots histogram"); + } + + return 0; +} + +#ifndef hw_breakpoint_weight +static inline int hw_breakpoint_weight(struct perf_event *bp) { return 1; } +#endif static inline enum bp_type_idx find_slot_idx(u64 bp_type) { @@ -82,39 +297,61 @@ static inline enum bp_type_idx find_slot_idx(u64 bp_type) } /* - * Report the maximum number of pinned breakpoints a task - * have in this cpu + * Return the maximum number of pinned breakpoints a task has in this CPU. */ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) { - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int i; + struct bp_slots_histogram *tsk_pinned = &get_bp_info(cpu, type)->tsk_pinned; - for (i = nr_slots[type] - 1; i >= 0; i--) { - if (tsk_pinned[i] > 0) - return i + 1; - } - - return 0; + /* + * At this point we want to have acquired the bp_cpuinfo_sem as a + * writer to ensure that there are no concurrent writers in + * toggle_bp_task_slot() to tsk_pinned, and we get a stable snapshot. + */ + lockdep_assert_held_write(&bp_cpuinfo_sem); + return bp_slots_histogram_max_merge(tsk_pinned, &tsk_pinned_all[type], type); } /* * Count the number of breakpoints of the same type and same task. * The given event must be not on the list. + * + * If @cpu is -1, but the result of task_bp_pinned() is not CPU-independent, + * returns a negative value. */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.target; + struct rhlist_head *head, *pos; struct perf_event *iter; int count = 0; - list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.target == tsk && - find_slot_idx(iter->attr.bp_type) == type && - (iter->cpu < 0 || cpu == iter->cpu)) - count += hw_breakpoint_weight(iter); + /* + * We need a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); + + rcu_read_lock(); + head = rhltable_lookup(&task_bps_ht, &bp->hw.target, task_bps_ht_params); + if (!head) + goto out; + + rhl_for_each_entry_rcu(iter, pos, head, hw.bp_list) { + if (find_slot_idx(iter->attr.bp_type) != type) + continue; + + if (iter->cpu >= 0) { + if (cpu == -1) { + count = -1; + goto out; + } else if (cpu != iter->cpu) + continue; + } + + count += hw_breakpoint_weight(iter); } +out: + rcu_read_unlock(); return count; } @@ -126,16 +363,29 @@ static const struct cpumask *cpumask_of_bp(struct perf_event *bp) } /* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). + * Returns the max pinned breakpoint slots in a given + * CPU (cpu > -1) or across all of them (cpu = -1). */ -static void -fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, - enum bp_type_idx type) +static int +max_bp_pinned_slots(struct perf_event *bp, enum bp_type_idx type) { const struct cpumask *cpumask = cpumask_of_bp(bp); + int pinned_slots = 0; int cpu; + if (bp->hw.target && bp->cpu < 0) { + int max_pinned = task_bp_pinned(-1, bp, type); + + if (max_pinned >= 0) { + /* + * Fast path: task_bp_pinned() is CPU-independent and + * returns the same value for any CPU. 
+ */ + max_pinned += bp_slots_histogram_max(&cpu_pinned[type], type); + return max_pinned; + } + } + for_each_cpu(cpu, cpumask) { struct bp_cpuinfo *info = get_bp_info(cpu, type); int nr; @@ -146,71 +396,131 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, else nr += task_bp_pinned(cpu, bp, type); - if (nr > slots->pinned) - slots->pinned = nr; - - nr = info->flexible; - if (nr > slots->flexible) - slots->flexible = nr; + pinned_slots = max(nr, pinned_slots); } -} -/* - * For now, continue to consider flexible as pinned, until we can - * ensure no flexible event can ever be scheduled before a pinned event - * in a same cpu. - */ -static void -fetch_this_slot(struct bp_busy_slots *slots, int weight) -{ - slots->pinned += weight; -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct perf_event *bp, int cpu, - enum bp_type_idx type, int weight) -{ - unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned; - int old_idx, new_idx; - - old_idx = task_bp_pinned(cpu, bp, type) - 1; - new_idx = old_idx + weight; - - if (old_idx >= 0) - tsk_pinned[old_idx]--; - if (new_idx >= 0) - tsk_pinned[new_idx]++; + return pinned_slots; } /* * Add/remove the given breakpoint in our constraint table */ -static void -toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, - int weight) +static int +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, int weight) { - const struct cpumask *cpumask = cpumask_of_bp(bp); - int cpu; + int cpu, next_tsk_pinned; if (!enable) weight = -weight; - /* Pinned counter cpu profiling */ if (!bp->hw.target) { - get_bp_info(bp->cpu, type)->cpu_pinned += weight; - return; + /* + * Update the pinned CPU slots, in per-CPU bp_cpuinfo and in the + * global histogram. + */ + struct bp_cpuinfo *info = get_bp_info(bp->cpu, type); + + lockdep_assert_held_write(&bp_cpuinfo_sem); + bp_slots_histogram_add(&cpu_pinned[type], info->cpu_pinned, weight); + info->cpu_pinned += weight; + return 0; + } + + /* + * If bp->hw.target, tsk_pinned is only modified, but not used + * otherwise. We can permit concurrent updates as long as there are no + * other uses: having acquired bp_cpuinfo_sem as a reader allows + * concurrent updates here. Uses of tsk_pinned will require acquiring + * bp_cpuinfo_sem as a writer to stabilize tsk_pinned's value. + */ + lockdep_assert_held_read(&bp_cpuinfo_sem); + + /* + * Update the pinned task slots, in per-CPU bp_cpuinfo and in the global + * histogram. We need to take care of 4 cases: + * + * 1. This breakpoint targets all CPUs (cpu < 0), and there may only + * exist other task breakpoints targeting all CPUs. In this case we + * can simply update the global slots histogram. + * + * 2. This breakpoint targets a specific CPU (cpu >= 0), but there may + * only exist other task breakpoints targeting all CPUs. + * + * a. On enable: remove the existing breakpoints from the global + * slots histogram and use the per-CPU histogram. + * + * b. On disable: re-insert the existing breakpoints into the global + * slots histogram and remove from per-CPU histogram. + * + * 3. Some other existing task breakpoints target specific CPUs. Only + * update the per-CPU slots histogram. + */ + + if (!enable) { + /* + * Remove before updating histograms so we can determine if this + * was the last task breakpoint for a specific CPU. 
+ */ + int ret = rhltable_remove(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + if (ret) + return ret; + } + /* + * Note: If !enable, next_tsk_pinned will not count the to-be-removed breakpoint. + */ + next_tsk_pinned = task_bp_pinned(-1, bp, type); + + if (next_tsk_pinned >= 0) { + if (bp->cpu < 0) { /* Case 1: fast path */ + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, weight); + } else if (enable) { /* Case 2.a: slow path */ + /* Add existing to per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + 0, next_tsk_pinned); + } + /* Add this first CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], next_tsk_pinned, + -next_tsk_pinned); + } else { /* Case 2.b: slow path */ + /* Remove this last CPU-pinned task breakpoint. */ + bp_slots_histogram_add(&get_bp_info(bp->cpu, type)->tsk_pinned, + next_tsk_pinned + hw_breakpoint_weight(bp), weight); + /* Remove all from per-CPU histograms. */ + for_each_possible_cpu(cpu) { + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, -next_tsk_pinned); + } + /* Rebalance global task pinned histogram. */ + bp_slots_histogram_add(&tsk_pinned_all[type], 0, next_tsk_pinned); + } + } else { /* Case 3: slow path */ + const struct cpumask *cpumask = cpumask_of_bp(bp); + + for_each_cpu(cpu, cpumask) { + next_tsk_pinned = task_bp_pinned(cpu, bp, type); + if (!enable) + next_tsk_pinned += hw_breakpoint_weight(bp); + bp_slots_histogram_add(&get_bp_info(cpu, type)->tsk_pinned, + next_tsk_pinned, weight); + } } - /* Pinned counter task profiling */ - for_each_cpu(cpu, cpumask) - toggle_bp_task_slot(bp, cpu, type, weight); + /* + * Readers want a stable snapshot of the per-task breakpoint list. + */ + assert_bp_constraints_lock_held(bp); if (enable) - list_add_tail(&bp->hw.bp_list, &bp_task_head); - else - list_del(&bp->hw.bp_list); + return rhltable_insert(&task_bps_ht, &bp->hw.bp_list, task_bps_ht_params); + + return 0; } __weak int arch_reserve_bp_slot(struct perf_event *bp) @@ -234,7 +544,12 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) } /* - * Constraints to check before allowing this new breakpoint counter: + * Constraints to check before allowing this new breakpoint counter. + * + * Note: Flexible breakpoints are currently unimplemented, but outlined in the + * below algorithm for completeness. The implementation treats flexible as + * pinned due to no guarantee that we currently always schedule flexible events + * before a pinned event in a same CPU. * * == Non-pinned counter == (Considered as pinned for now) * @@ -276,8 +591,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp) */ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) { - struct bp_busy_slots slots = {0}; enum bp_type_idx type; + int max_pinned_slots; int weight; int ret; @@ -293,36 +608,24 @@ static int __reserve_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - fetch_bp_busy_slots(&slots, bp, type); - /* - * Simulate the addition of this breakpoint to the constraints - * and see the result. 
- */ - fetch_this_slot(&slots, weight); - - /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + /* Check if this new breakpoint can be satisfied across all CPUs. */ + max_pinned_slots = max_bp_pinned_slots(bp, type) + weight; + if (max_pinned_slots > hw_breakpoint_slots_cached(type)) return -ENOSPC; ret = arch_reserve_bp_slot(bp); if (ret) return ret; - toggle_bp_slot(bp, true, type, weight); - - return 0; + return toggle_bp_slot(bp, true, type, weight); } int reserve_bp_slot(struct perf_event *bp) { - int ret; - - mutex_lock(&nr_bp_mutex); - - ret = __reserve_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __reserve_bp_slot(bp, bp->attr.bp_type); + bp_constraints_unlock(mtx); return ret; } @@ -335,17 +638,16 @@ static void __release_bp_slot(struct perf_event *bp, u64 bp_type) type = find_slot_idx(bp_type); weight = hw_breakpoint_weight(bp); - toggle_bp_slot(bp, false, type, weight); + WARN_ON(toggle_bp_slot(bp, false, type, weight)); } void release_bp_slot(struct perf_event *bp) { - mutex_lock(&nr_bp_mutex); + struct mutex *mtx = bp_constraints_lock(bp); arch_unregister_hw_breakpoint(bp); __release_bp_slot(bp, bp->attr.bp_type); - - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); } static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) @@ -372,11 +674,10 @@ static int __modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) { - int ret; + struct mutex *mtx = bp_constraints_lock(bp); + int ret = __modify_bp_slot(bp, old_type, new_type); - mutex_lock(&nr_bp_mutex); - ret = __modify_bp_slot(bp, old_type, new_type); - mutex_unlock(&nr_bp_mutex); + bp_constraints_unlock(mtx); return ret; } @@ -387,18 +688,28 @@ static int modify_bp_slot(struct perf_event *bp, u64 old_type, u64 new_type) */ int dbg_reserve_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + int ret; + + if (bp_constraints_is_locked(bp)) return -1; - return __reserve_bp_slot(bp, bp->attr.bp_type); + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); + ret = __reserve_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); + + return ret; } int dbg_release_bp_slot(struct perf_event *bp) { - if (mutex_is_locked(&nr_bp_mutex)) + if (bp_constraints_is_locked(bp)) return -1; + /* Locks aren't held; disable lockdep assert checking. */ + lockdep_off(); __release_bp_slot(bp, bp->attr.bp_type); + lockdep_on(); return 0; } @@ -604,6 +915,50 @@ void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) } EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); +/** + * hw_breakpoint_is_used - check if breakpoints are currently used + * + * Returns: true if breakpoints are used, false otherwise. 
+ */ +bool hw_breakpoint_is_used(void) +{ + int cpu; + + if (!constraints_initialized) + return false; + + for_each_possible_cpu(cpu) { + for (int type = 0; type < TYPE_MAX; ++type) { + struct bp_cpuinfo *info = get_bp_info(cpu, type); + + if (info->cpu_pinned) + return true; + + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + if (atomic_read(&info->tsk_pinned.count[slot])) + return true; + } + } + } + + for (int type = 0; type < TYPE_MAX; ++type) { + for (int slot = 0; slot < hw_breakpoint_slots_cached(type); ++slot) { + /* + * Warn, because if there are CPU pinned counters, + * should never get here; bp_cpuinfo::cpu_pinned should + * be consistent with the global cpu_pinned histogram. + */ + if (WARN_ON(atomic_read(&cpu_pinned[type].count[slot]))) + return true; + + if (atomic_read(&tsk_pinned_all[type].count[slot])) + return true; + } + } + + return false; +} + static struct notifier_block hw_breakpoint_exceptions_nb = { .notifier_call = hw_breakpoint_exceptions_notify, /* we need to be notified first */ @@ -678,38 +1033,19 @@ static struct pmu perf_breakpoint = { int __init init_hw_breakpoint(void) { - int cpu, err_cpu; - int i; - - for (i = 0; i < TYPE_MAX; i++) - nr_slots[i] = hw_breakpoint_slots(i); + int ret; - for_each_possible_cpu(cpu) { - for (i = 0; i < TYPE_MAX; i++) { - struct bp_cpuinfo *info = get_bp_info(cpu, i); + ret = rhltable_init(&task_bps_ht, &task_bps_ht_params); + if (ret) + return ret; - info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int), - GFP_KERNEL); - if (!info->tsk_pinned) - goto err_alloc; - } - } + ret = init_breakpoint_slots(); + if (ret) + return ret; - constraints_initialized = 1; + constraints_initialized = true; perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); return register_die_notifier(&hw_breakpoint_exceptions_nb); - - err_alloc: - for_each_possible_cpu(err_cpu) { - for (i = 0; i < TYPE_MAX; i++) - kfree(get_bp_info(err_cpu, i)->tsk_pinned); - if (err_cpu == cpu) - break; - } - - return -ENOMEM; } - - diff --git a/kernel/events/hw_breakpoint_test.c b/kernel/events/hw_breakpoint_test.c new file mode 100644 index 000000000000..5ced822df788 --- /dev/null +++ b/kernel/events/hw_breakpoint_test.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for hw_breakpoint constraints accounting logic. + * + * Copyright (C) 2022, Google LLC. 
+ */ + +#include <kunit/test.h> +#include <linux/cpumask.h> +#include <linux/hw_breakpoint.h> +#include <linux/kthread.h> +#include <linux/perf_event.h> +#include <asm/hw_breakpoint.h> + +#define TEST_REQUIRES_BP_SLOTS(test, slots) \ + do { \ + if ((slots) > get_test_bp_slots()) { \ + kunit_skip((test), "Requires breakpoint slots: %d > %d", slots, \ + get_test_bp_slots()); \ + } \ + } while (0) + +#define TEST_EXPECT_NOSPC(expr) KUNIT_EXPECT_EQ(test, -ENOSPC, PTR_ERR(expr)) + +#define MAX_TEST_BREAKPOINTS 512 + +static char break_vars[MAX_TEST_BREAKPOINTS]; +static struct perf_event *test_bps[MAX_TEST_BREAKPOINTS]; +static struct task_struct *__other_task; + +static struct perf_event *register_test_bp(int cpu, struct task_struct *tsk, int idx) +{ + struct perf_event_attr attr = {}; + + if (WARN_ON(idx < 0 || idx >= MAX_TEST_BREAKPOINTS)) + return NULL; + + hw_breakpoint_init(&attr); + attr.bp_addr = (unsigned long)&break_vars[idx]; + attr.bp_len = HW_BREAKPOINT_LEN_1; + attr.bp_type = HW_BREAKPOINT_RW; + return perf_event_create_kernel_counter(&attr, cpu, tsk, NULL, NULL); +} + +static void unregister_test_bp(struct perf_event **bp) +{ + if (WARN_ON(IS_ERR(*bp))) + return; + if (WARN_ON(!*bp)) + return; + unregister_hw_breakpoint(*bp); + *bp = NULL; +} + +static int get_test_bp_slots(void) +{ + static int slots; + + if (!slots) + slots = hw_breakpoint_slots(TYPE_DATA); + + return slots; +} + +static void fill_one_bp_slot(struct kunit *test, int *id, int cpu, struct task_struct *tsk) +{ + struct perf_event *bp = register_test_bp(cpu, tsk, *id); + + KUNIT_ASSERT_NOT_NULL(test, bp); + KUNIT_ASSERT_FALSE(test, IS_ERR(bp)); + KUNIT_ASSERT_NULL(test, test_bps[*id]); + test_bps[(*id)++] = bp; +} + +/* + * Fills up the given @cpu/@tsk with breakpoints, only leaving @skip slots free. + * + * Returns true if this can be called again, continuing at @id. + */ +static bool fill_bp_slots(struct kunit *test, int *id, int cpu, struct task_struct *tsk, int skip) +{ + for (int i = 0; i < get_test_bp_slots() - skip; ++i) + fill_one_bp_slot(test, id, cpu, tsk); + + return *id + get_test_bp_slots() <= MAX_TEST_BREAKPOINTS; +} + +static int dummy_kthread(void *arg) +{ + return 0; +} + +static struct task_struct *get_other_task(struct kunit *test) +{ + struct task_struct *tsk; + + if (__other_task) + return __other_task; + + tsk = kthread_create(dummy_kthread, NULL, "hw_breakpoint_dummy_task"); + KUNIT_ASSERT_FALSE(test, IS_ERR(tsk)); + __other_task = tsk; + return __other_task; +} + +static int get_test_cpu(int num) +{ + int cpu; + + WARN_ON(num < 0); + + for_each_online_cpu(cpu) { + if (num-- <= 0) + break; + } + + return cpu; +} + +/* ===== Test cases ===== */ + +static void test_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), NULL, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_many_cpus(struct kunit *test) +{ + int idx = 0; + int cpu; + + /* Test that CPUs are independent. 
*/ + for_each_online_cpu(cpu) { + bool do_continue = fill_bp_slots(test, &idx, cpu, NULL, 0); + + TEST_EXPECT_NOSPC(register_test_bp(cpu, NULL, idx)); + if (!do_continue) + break; + } +} + +static void test_one_task_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, -1, current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one and adding back CPU-target should work. */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_two_tasks_on_all_cpus(struct kunit *test) +{ + int idx = 0; + + /* Test that tasks are independent. */ + fill_bp_slots(test, &idx, -1, current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Remove one from first task and adding back CPU-target should not work. */ + unregister_test_bp(&test_bps[0]); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_one_task_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* + * Remove one and adding back CPU-target should work; this case is + * special vs. above because the task's constraints are CPU-dependent. + */ + unregister_test_bp(&test_bps[0]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); +} + +static void test_one_task_mixed(struct kunit *test) +{ + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_bp_slots(test, &idx, -1, current, 1); + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* Transition from CPU-dependent pinned count to CPU-independent. */ + unregister_test_bp(&test_bps[0]); + unregister_test_bp(&test_bps[1]); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + fill_one_bp_slot(test, &idx, get_test_cpu(0), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); +} + +static void test_two_tasks_on_one_cpu(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, get_test_cpu(0), get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Can still create breakpoints on some other CPU. 
*/ + fill_bp_slots(test, &idx, get_test_cpu(1), NULL, 0); +} + +static void test_two_tasks_on_one_all_cpus(struct kunit *test) +{ + int idx = 0; + + fill_bp_slots(test, &idx, get_test_cpu(0), current, 0); + fill_bp_slots(test, &idx, -1, get_other_task(test), 0); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(-1, get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), get_other_task(test), idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + /* Cannot create breakpoints on some other CPU either. */ + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static void test_task_on_all_and_one_cpu(struct kunit *test) +{ + int tsk_on_cpu_idx, cpu_idx; + int idx = 0; + + TEST_REQUIRES_BP_SLOTS(test, 3); + + fill_bp_slots(test, &idx, -1, current, 2); + /* Transitioning from only all CPU breakpoints to mixed. */ + tsk_on_cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(0), current); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + + /* We should still be able to use up another CPU's slots. */ + cpu_idx = idx; + fill_one_bp_slot(test, &idx, get_test_cpu(1), NULL); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); + + /* Transitioning back to task target on all CPUs. */ + unregister_test_bp(&test_bps[tsk_on_cpu_idx]); + /* Still have a CPU target breakpoint in get_test_cpu(1). */ + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + /* Remove it and try again. */ + unregister_test_bp(&test_bps[cpu_idx]); + fill_one_bp_slot(test, &idx, -1, current); + + TEST_EXPECT_NOSPC(register_test_bp(-1, current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), current, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(0), NULL, idx)); + TEST_EXPECT_NOSPC(register_test_bp(get_test_cpu(1), NULL, idx)); +} + +static struct kunit_case hw_breakpoint_test_cases[] = { + KUNIT_CASE(test_one_cpu), + KUNIT_CASE(test_many_cpus), + KUNIT_CASE(test_one_task_on_all_cpus), + KUNIT_CASE(test_two_tasks_on_all_cpus), + KUNIT_CASE(test_one_task_on_one_cpu), + KUNIT_CASE(test_one_task_mixed), + KUNIT_CASE(test_two_tasks_on_one_cpu), + KUNIT_CASE(test_two_tasks_on_one_all_cpus), + KUNIT_CASE(test_task_on_all_and_one_cpu), + {}, +}; + +static int test_init(struct kunit *test) +{ + /* Most test cases want 2 distinct CPUs. */ + if (num_online_cpus() < 2) + return -EINVAL; + + /* Want the system to not use breakpoints elsewhere. */ + if (hw_breakpoint_is_used()) + return -EBUSY; + + return 0; +} + +static void test_exit(struct kunit *test) +{ + for (int i = 0; i < MAX_TEST_BREAKPOINTS; ++i) { + if (test_bps[i]) + unregister_test_bp(&test_bps[i]); + } + + if (__other_task) { + kthread_stop(__other_task); + __other_task = NULL; + } + + /* Verify that internal state agrees that no breakpoints are in use. 
*/ + KUNIT_EXPECT_FALSE(test, hw_breakpoint_is_used()); +} + +static struct kunit_suite hw_breakpoint_test_suite = { + .name = "hw_breakpoint", + .test_cases = hw_breakpoint_test_cases, + .init = test_init, + .exit = test_exit, +}; + +kunit_test_suites(&hw_breakpoint_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marco Elver <elver@google.com>"); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2eaa327f8158..d9e357b7e17c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ #include <linux/export.h> #include <linux/rmap.h> /* anon_vma_prepare */ #include <linux/mmu_notifier.h> /* set_pte_at_notify */ -#include <linux/swap.h> /* try_to_free_swap */ +#include <linux/swap.h> /* folio_free_swap */ #include <linux/ptrace.h> /* user_enable_single_step */ #include <linux/kdebug.h> /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ @@ -154,8 +154,10 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; @@ -163,14 +165,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, - GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } - /* For try_to_free_swap() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -179,14 +181,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); + folio_get(new_folio); page_add_new_anon_rmap(new_page, vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { + if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -198,15 +200,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, vma, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } @@ -349,9 +351,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -552,7 +555,7 @@ put_old: /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return 
ret; } @@ -1231,11 +1234,12 @@ int uprobe_apply(struct inode *inode, loff_t offset, static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1983,9 +1987,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* diff --git a/kernel/exit.c b/kernel/exit.c index 84021b24f79e..35e0a31a0315 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -60,6 +60,7 @@ #include <linux/writeback.h> #include <linux/shm.h> #include <linux/kcov.h> +#include <linux/kmsan.h> #include <linux/random.h> #include <linux/rcuwait.h> #include <linux/compat.h> @@ -183,6 +184,10 @@ void put_task_struct_rcu_user(struct task_struct *task) call_rcu(&task->rcu, delayed_put_task_struct); } +void __weak release_thread(struct task_struct *dead_task) +{ +} + void release_task(struct task_struct *p) { struct task_struct *leader; @@ -374,10 +379,10 @@ static void coredump_task_exit(struct task_struct *tsk) complete(&core_state->startup); for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE|TASK_FREEZABLE); if (!self.task) /* see coredump_finish() */ break; - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } @@ -466,6 +471,7 @@ assign_new_owner: goto retry; } WRITE_ONCE(mm->owner, c); + lru_gen_migrate_mm(mm); task_unlock(c); put_task_struct(c); } @@ -733,14 +739,33 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif +static void synchronize_group_exit(struct task_struct *tsk, long code) +{ + struct sighand_struct *sighand = tsk->sighand; + struct signal_struct *signal = tsk->signal; + + spin_lock_irq(&sighand->siglock); + signal->quick_threads--; + if ((signal->quick_threads == 0) && + !(signal->flags & SIGNAL_GROUP_EXIT)) { + signal->flags = SIGNAL_GROUP_EXIT; + signal->group_exit_code = code; + signal->group_stop_count = 0; + } + spin_unlock_irq(&sighand->siglock); +} + void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; + synchronize_group_exit(tsk, code); + WARN_ON(tsk->plug); kcov_task_exit(tsk); + kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); @@ -905,7 +930,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (sig->group_exec_task) exit_code = 0; - else if (!thread_group_empty(current)) { + else { struct sighand_struct *const sighand = current->sighand; spin_lock_irq(&sighand->siglock); diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 60dc825ecc2b..a7ccd2930c5f 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -247,15 +247,11 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, /* cut off if it is too long */ if (count > KSYM_NAME_LEN) count = KSYM_NAME_LEN; - buf = kmalloc(count + 1, GFP_KERNEL); - if (!buf) - return -ENOMEM; - if (copy_from_user(buf, buffer, count)) { - ret = -EFAULT; - goto out_free; - } - buf[count] = '\0'; + buf = memdup_user_nul(buffer, count); + if (IS_ERR(buf)) + return PTR_ERR(buf); + sym = strstrip(buf); mutex_lock(&fei_lock); @@ -298,17 +294,15 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, } 
ret = register_kprobe(&attr->kp); - if (!ret) - fei_debugfs_add_attr(attr); - if (ret < 0) - fei_attr_remove(attr); - else { - list_add_tail(&attr->list, &fei_attr_list); - ret = count; + if (ret) { + fei_attr_free(attr); + goto out; } + fei_debugfs_add_attr(attr); + list_add_tail(&attr->list, &fei_attr_list); + ret = count; out: mutex_unlock(&fei_lock); -out_free: kfree(buf); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 2b6bd511c6ed..08969f5aa38d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,13 +37,13 @@ #include <linux/fdtable.h> #include <linux/iocontext.h> #include <linux/key.h> +#include <linux/kmsan.h> #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mm_inline.h> -#include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> @@ -97,7 +97,6 @@ #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> -#include <linux/sched/mm.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -475,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) */ *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); - new->vm_next = new->vm_prev = NULL; dup_anon_vma_name(orig, new); } return new; @@ -580,11 +578,12 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; + struct vm_area_struct *mpnt, *tmp; int retval; - unsigned long charge; + unsigned long charge = 0; LIST_HEAD(uf); + MA_STATE(old_mas, &oldmm->mm_mt, 0, 0); + MA_STATE(mas, &mm->mm_mt, 0, 0); uprobe_start_dup_mmap(); if (mmap_write_lock_killable(oldmm)) { @@ -606,16 +605,16 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; khugepaged_fork(mm, oldmm); - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + retval = mas_expected_entries(&mas, oldmm->map_count); + if (retval) + goto out; + + mas_for_each(&old_mas, mpnt, ULONG_MAX) { struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { @@ -629,7 +628,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, */ if (fatal_signal_pending(current)) { retval = -EINTR; - goto out; + goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); @@ -675,24 +674,17 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only + * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. 
- */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; + hugetlb_dup_vma_private(tmp); - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; + /* Link the vma into the MT */ + mas.index = tmp->vm_start; + mas.last = tmp->vm_end - 1; + mas_store(&mas, tmp); + if (mas_is_err(&mas)) + goto fail_nomem_mas_store; mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) @@ -702,10 +694,12 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_ops->open(tmp); if (retval) - goto out; + goto loop_out; } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); +loop_out: + mas_destroy(&mas); out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); @@ -714,6 +708,9 @@ out: fail_uprobe_end: uprobe_end_dup_mmap(); return retval; + +fail_nomem_mas_store: + unlink_anon_vmas(tmp); fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: @@ -721,7 +718,7 @@ fail_nomem_policy: fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); - goto out; + goto loop_out; } static inline int mm_alloc_pgd(struct mm_struct *mm) @@ -925,13 +922,13 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) + for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); - set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); + set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", @@ -1026,6 +1023,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->worker_private = NULL; kcov_task_init(tsk); + kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION @@ -1109,9 +1107,8 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - mm->mmap = NULL; - mm->mm_rb = RB_ROOT; - mm->vmacache_seqnum = 0; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); + mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); @@ -1152,6 +1149,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext; mm->user_ns = get_user_ns(user_ns); + lru_gen_init_mm(mm); return mm; fail_nocontext: @@ -1194,6 +1192,7 @@ static inline void __mmput(struct mm_struct *mm) } if (mm->binfmt) module_put(mm->binfmt->module); + lru_gen_del_mm(mm); mmdrop(mm); } @@ -1285,13 +1284,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /* Forbid mm->exe_file change if old file still mapped. 
*/ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, - &old_exe_file->f_path)) + &old_exe_file->f_path)) { ret = -EBUSY; + break; + } } mmap_read_unlock(mm); fput(old_exe_file); @@ -1421,13 +1423,12 @@ static void complete_vfork_done(struct task_struct *tsk) static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { + unsigned int state = TASK_UNINTERRUPTIBLE|TASK_KILLABLE|TASK_FREEZABLE; int killed; - freezer_do_not_count(); cgroup_enter_frozen(); - killed = wait_for_completion_killable(vfork); + killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); - freezer_count(); if (killed) { task_lock(child); @@ -1567,9 +1568,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; - /* initialize the new vmacache entries */ - vmacache_flush(tsk); - if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; @@ -1693,6 +1691,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return -ENOMEM; sig->nr_threads = 1; + sig->quick_threads = 1; atomic_set(&sig->live, 1); refcount_set(&sig->sigcnt, 1); @@ -2116,7 +2115,7 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_free; retval = -EAGAIN; - if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { + if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_cleanup_count; @@ -2460,6 +2459,7 @@ static __latent_entropy struct task_struct *copy_process( __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; + current->signal->quick_threads++; atomic_inc(¤t->signal->live); refcount_inc(¤t->signal->sigcnt); task_join_group_stop(p); @@ -2692,6 +2692,13 @@ pid_t kernel_clone(struct kernel_clone_args *args) get_task_struct(p); } + if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) { + /* lock the task to synchronize with memcg migration */ + task_lock(p); + lru_gen_add_mm(p->mm); + task_unlock(p); + } + wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ diff --git a/kernel/freezer.c b/kernel/freezer.c index 45ab36ffd0e7..4fad0e6fca64 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -13,10 +13,11 @@ #include <linux/kthread.h> /* total number of freezing conditions in effect */ -atomic_t system_freezing_cnt = ATOMIC_INIT(0); -EXPORT_SYMBOL(system_freezing_cnt); +DEFINE_STATIC_KEY_FALSE(freezer_active); +EXPORT_SYMBOL(freezer_active); -/* indicate whether PM freezing is in effect, protected by +/* + * indicate whether PM freezing is in effect, protected by * system_transition_mutex */ bool pm_freezing; @@ -29,7 +30,7 @@ static DEFINE_SPINLOCK(freezer_lock); * freezing_slow_path - slow path for testing whether a task needs to be frozen * @p: task to be tested * - * This function is called by freezing() if system_freezing_cnt isn't zero + * This function is called by freezing() if freezer_active isn't zero * and tests whether @p needs to enter and stay in frozen state. Can be * called under any context. The freezers are responsible for ensuring the * target tasks see the updated state. 
@@ -52,41 +53,40 @@ bool freezing_slow_path(struct task_struct *p) } EXPORT_SYMBOL(freezing_slow_path); +bool frozen(struct task_struct *p) +{ + return READ_ONCE(p->__state) & TASK_FROZEN; +} + /* Refrigerator is place where frozen processes are stored :-). */ bool __refrigerator(bool check_kthr_stop) { - /* Hmm, should we be allowed to suspend when there are realtime - processes around? */ + unsigned int state = get_current_state(); bool was_frozen = false; - unsigned int save = get_current_state(); pr_debug("%s entered refrigerator\n", current->comm); + WARN_ON_ONCE(state && !(state & TASK_NORMAL)); + for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); + bool freeze; + + set_current_state(TASK_FROZEN); spin_lock_irq(&freezer_lock); - current->flags |= PF_FROZEN; - if (!freezing(current) || - (check_kthr_stop && kthread_should_stop())) - current->flags &= ~PF_FROZEN; + freeze = freezing(current) && !(check_kthr_stop && kthread_should_stop()); spin_unlock_irq(&freezer_lock); - if (!(current->flags & PF_FROZEN)) + if (!freeze) break; + was_frozen = true; schedule(); } + __set_current_state(TASK_RUNNING); pr_debug("%s left refrigerator\n", current->comm); - /* - * Restore saved task state before returning. The mb'd version - * needs to be used; otherwise, it might silently break - * synchronization which depends on ordered task state change. - */ - set_current_state(save); - return was_frozen; } EXPORT_SYMBOL(__refrigerator); @@ -101,6 +101,44 @@ static void fake_signal_wake_up(struct task_struct *p) } } +static int __set_task_frozen(struct task_struct *p, void *arg) +{ + unsigned int state = READ_ONCE(p->__state); + + if (p->on_rq) + return 0; + + if (p != current && task_curr(p)) + return 0; + + if (!(state & (TASK_FREEZABLE | __TASK_STOPPED | __TASK_TRACED))) + return 0; + + /* + * Only TASK_NORMAL can be augmented with TASK_FREEZABLE, since they + * can suffer spurious wakeups. + */ + if (state & TASK_FREEZABLE) + WARN_ON_ONCE(!(state & TASK_NORMAL)); + +#ifdef CONFIG_LOCKDEP + /* + * It's dangerous to freeze with locks held; there be dragons there. + */ + if (!(state & __TASK_FREEZABLE_UNSAFE)) + WARN_ON_ONCE(debug_locks && p->lockdep_depth); +#endif + + WRITE_ONCE(p->__state, TASK_FROZEN); + return TASK_FROZEN; +} + +static bool __freeze_task(struct task_struct *p) +{ + /* TASK_FREEZABLE|TASK_STOPPED|TASK_TRACED -> TASK_FROZEN */ + return task_call_func(p, __set_task_frozen, NULL); +} + /** * freeze_task - send a freeze request to given task * @p: task to send the request to @@ -116,20 +154,8 @@ bool freeze_task(struct task_struct *p) { unsigned long flags; - /* - * This check can race with freezer_do_not_count, but worst case that - * will result in an extra wakeup being sent to the task. It does not - * race with freezer_count(), the barriers in freezer_count() and - * freezer_should_skip() ensure that either freezer_count() sees - * freezing == true in try_to_freeze() and freezes, or - * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task - * normally. 
- */ - if (freezer_should_skip(p)) - return false; - spin_lock_irqsave(&freezer_lock, flags); - if (!freezing(p) || frozen(p)) { + if (!freezing(p) || frozen(p) || __freeze_task(p)) { spin_unlock_irqrestore(&freezer_lock, flags); return false; } @@ -137,19 +163,52 @@ bool freeze_task(struct task_struct *p) if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); else - wake_up_state(p, TASK_INTERRUPTIBLE); + wake_up_state(p, TASK_NORMAL); spin_unlock_irqrestore(&freezer_lock, flags); return true; } +/* + * The special task states (TASK_STOPPED, TASK_TRACED) keep their canonical + * state in p->jobctl. If either of them got a wakeup that was missed because + * TASK_FROZEN, then their canonical state reflects that and the below will + * refuse to restore the special state and instead issue the wakeup. + */ +static int __set_task_special(struct task_struct *p, void *arg) +{ + unsigned int state = 0; + + if (p->jobctl & JOBCTL_TRACED) + state = TASK_TRACED; + + else if (p->jobctl & JOBCTL_STOPPED) + state = TASK_STOPPED; + + if (state) + WRITE_ONCE(p->__state, state); + + return state; +} + void __thaw_task(struct task_struct *p) { - unsigned long flags; + unsigned long flags, flags2; spin_lock_irqsave(&freezer_lock, flags); - if (frozen(p)) - wake_up_process(p); + if (WARN_ON_ONCE(freezing(p))) + goto unlock; + + if (lock_task_sighand(p, &flags2)) { + /* TASK_FROZEN -> TASK_{STOPPED,TRACED} */ + bool ret = task_call_func(p, __set_task_special, NULL); + unlock_task_sighand(p, &flags2); + if (ret) + goto unlock; + } + + wake_up_state(p, TASK_FROZEN); +unlock: spin_unlock_irqrestore(&freezer_lock, flags); } diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c index 4ce0923f1ce3..ba01b9408203 100644 --- a/kernel/futex/waitwake.c +++ b/kernel/futex/waitwake.c @@ -334,7 +334,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); /* Arm the timer */ @@ -352,7 +352,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) - freezable_schedule(); + schedule(); } __set_current_state(TASK_RUNNING); } @@ -430,7 +430,7 @@ retry: return ret; } - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; @@ -504,7 +504,7 @@ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, return; } - freezable_schedule(); + schedule(); } /** diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh index 0c78e64f747d..473036b43c83 100755 --- a/kernel/gen_kheaders.sh +++ b/kernel/gen_kheaders.sh @@ -31,8 +31,8 @@ if [ "$building_out_of_srctree" ]; then fi all_dirs="$all_dirs $dir_list" -# include/generated/compile.h is ignored because it is touched even when none -# of the source files changed. +# include/generated/utsversion.h is ignored because it is generated after this +# script is executed. (utsversion.h is unneeded for kheaders) # # When Kconfig regenerates include/generated/autoconf.h, its timestamp is # updated, but the contents might be still the same. 
When any CONFIG option is @@ -42,7 +42,7 @@ all_dirs="$all_dirs $dir_list" # # Ignore them for md5 calculation to avoid pointless regeneration. headers_md5="$(find $all_dirs -name "*.h" | - grep -v "include/generated/compile.h" | + grep -v "include/generated/utsversion.h" | grep -v "include/generated/autoconf.h" | xargs ls -l | md5sum | cut -d ' ' -f1)" diff --git a/kernel/hung_task.c b/kernel/hung_task.c index bb2354f73ded..c71889f3f3fc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -95,8 +95,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * Ensure the task is not frozen. * Also, skip vfork and any other user process that freezer should skip. */ - if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) - return; + if (unlikely(READ_ONCE(t->__state) & TASK_FROZEN)) + return; /* * When a freshly created task is scheduled once, changes its state to @@ -191,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { + unsigned int state; + if (!max_count--) goto unlock; if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { @@ -198,8 +200,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) goto unlock; last_break = jiffies; } - /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ - if (READ_ONCE(t->__state) == TASK_UNINTERRUPTIBLE) + /* + * skip the TASK_KILLABLE tasks -- these can be killed + * skip the TASK_IDLE tasks -- those are genuinely idle + */ + state = READ_ONCE(t->__state); + if ((state & TASK_UNINTERRUPTIBLE) && + !(state & TASK_WAKEKILL) && + !(state & TASK_NOLOAD)) check_hung_task(t, timeout); } unlock: diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 5db0230aa6b5..a91f9001103c 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -705,6 +705,30 @@ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); + /** + * generic_handle_irq_safe - Invoke the handler for a HW irq belonging + * to a domain from any context. + * @domain: The domain where to perform the lookup + * @hwirq: The HW irq number to convert to a logical one + * + * Returns: 0 on success, a negative value on error. + * + * This function can be called from any context (IRQ or process + * context). If the interrupt is marked as 'enforce IRQ-context only' then + * the function must be invoked from hard interrupt context. + */ +int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); + local_irq_restore(flags); + return ret; +} +EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); + /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. diff --git a/kernel/kcov.c b/kernel/kcov.c index e19c84b02452..e5cd09fd8a05 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include <linux/fs.h> #include <linux/hashtable.h> #include <linux/init.h> +#include <linux/kmsan-checks.h> #include <linux/mm.h> #include <linux/preempt.h> #include <linux/printk.h> @@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); + /* + * KMSAN doesn't instrument this file, so it may not know area->list + * is initialized. 
Unpoison it explicitly to avoid reports in + * kcov_remote_area_get(). + */ + kmsan_unpoison_memory(&area->list, sizeof(area->list)); } static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index 75712959c84e..00cdf8fa5693 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -26,7 +26,7 @@ static bool __init test_requires(void) { /* random should be initialized for the below tests */ - return prandom_u32() + prandom_u32() != 0; + return get_random_u32() + get_random_u32() != 0; } /* @@ -46,7 +46,7 @@ static bool __init test_encode_decode(void) unsigned long addr; size_t verif_size; - prandom_bytes(&addr, sizeof(addr)); + get_random_bytes(&addr, sizeof(addr)); if (addr < PAGE_SIZE) addr = PAGE_SIZE; diff --git a/kernel/kexec.c b/kernel/kexec.c index b5e40f069768..cb8e6e6f983c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -93,13 +93,10 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments, /* * Because we write directly to the reserved memory region when loading - * crash kernels we need a mutex here to prevent multiple crash kernels - * from attempting to load simultaneously, and to prevent a crash kernel - * from loading over the top of a in use crash kernel. - * - * KISS: always take the mutex. + * crash kernels we need a serialization here to prevent multiple crash + * kernels from attempting to load simultaneously. */ - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (flags & KEXEC_ON_CRASH) { @@ -165,7 +162,7 @@ out: kimage_free(image); out_unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index acd029b307e4..ca2743f9c634 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -46,7 +46,7 @@ #include <crypto/hash.h> #include "kexec_internal.h" -DEFINE_MUTEX(kexec_mutex); +atomic_t __kexec_lock = ATOMIC_INIT(0); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -809,7 +809,7 @@ static int kimage_load_normal_segment(struct kimage *image, if (result < 0) goto out; - ptr = kmap(page); + ptr = kmap_local_page(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; @@ -822,7 +822,7 @@ static int kimage_load_normal_segment(struct kimage *image, memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); - kunmap(page); + kunmap_local(ptr); if (result) { result = -EFAULT; goto out; @@ -873,7 +873,7 @@ static int kimage_load_crash_segment(struct kimage *image, goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); - ptr = kmap(page); + ptr = kmap_local_page(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); @@ -889,7 +889,7 @@ static int kimage_load_crash_segment(struct kimage *image, else result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); - kunmap(page); + kunmap_local(ptr); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; @@ -959,7 +959,7 @@ late_initcall(kexec_core_sysctl_init); */ void __noclone __crash_kexec(struct pt_regs *regs) { - /* Take the kexec_mutex here to prevent sys_kexec_load + /* Take the kexec_lock here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. 
* @@ -967,7 +967,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ - if (mutex_trylock(&kexec_mutex)) { + if (kexec_trylock()) { if (kexec_crash_image) { struct pt_regs fixed_regs; @@ -976,7 +976,7 @@ void __noclone __crash_kexec(struct pt_regs *regs) machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } - mutex_unlock(&kexec_mutex); + kexec_unlock(); } } STACK_FRAME_NON_STANDARD(__crash_kexec); @@ -1004,14 +1004,17 @@ void crash_kexec(struct pt_regs *regs) } } -size_t crash_get_memory_size(void) +ssize_t crash_get_memory_size(void) { - size_t size = 0; + ssize_t size = 0; + + if (!kexec_trylock()) + return -EBUSY; - mutex_lock(&kexec_mutex); if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); - mutex_unlock(&kexec_mutex); + + kexec_unlock(); return size; } @@ -1022,7 +1025,8 @@ int crash_shrink_memory(unsigned long new_size) unsigned long old_size; struct resource *ram_res; - mutex_lock(&kexec_mutex); + if (!kexec_trylock()) + return -EBUSY; if (kexec_crash_image) { ret = -ENOENT; @@ -1060,7 +1064,7 @@ int crash_shrink_memory(unsigned long new_size) insert_resource(&iomem_resource, ram_res); unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return ret; } @@ -1132,7 +1136,7 @@ int kernel_kexec(void) { int error = 0; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; @@ -1208,6 +1212,6 @@ int kernel_kexec(void) #endif Unlock: - mutex_unlock(&kexec_mutex); + kexec_unlock(); return error; } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 1d546dc97c50..45637511e0de 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -339,7 +339,7 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, image = NULL; - if (!mutex_trylock(&kexec_mutex)) + if (!kexec_trylock()) return -EBUSY; dest_image = &kexec_image; @@ -411,7 +411,7 @@ out: if ((flags & KEXEC_FILE_ON_CRASH) && kexec_crash_image) arch_kexec_protect_crashkres(); - mutex_unlock(&kexec_mutex); + kexec_unlock(); kimage_free(image); return ret; } diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index 48aaf2ac0d0d..74da1409cd14 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -13,7 +13,20 @@ void kimage_terminate(struct kimage *image); int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end); -extern struct mutex kexec_mutex; +/* + * Whatever is used to serialize accesses to the kexec_crash_image needs to be + * NMI safe, as __crash_kexec() can happen during nmi_panic(), so here we use a + * "simple" atomic variable that is acquired with a cmpxchg(). 
+ */ +extern atomic_t __kexec_lock; +static inline bool kexec_trylock(void) +{ + return atomic_cmpxchg_acquire(&__kexec_lock, 0, 1) == 0; +} +static inline void kexec_unlock(void) +{ + atomic_set_release(&__kexec_lock, 0); +} #ifdef CONFIG_KEXEC_FILE #include <linux/purgatory.h> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index b1292a57c2a5..65dba9076f31 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -105,7 +105,12 @@ KERNEL_ATTR_RO(kexec_crash_loaded); static ssize_t kexec_crash_size_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%zu\n", crash_get_memory_size()); + ssize_t size = crash_get_memory_size(); + + if (size < 0) + return size; + + return sprintf(buf, "%zd\n", size); } static ssize_t kexec_crash_size_store(struct kobject *kobj, struct kobj_attribute *attr, diff --git a/kernel/kthread.c b/kernel/kthread.c index 28a6b7ab4a0f..f97fd01a2932 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -704,6 +704,7 @@ int kthread_stop(struct task_struct *k) kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); + set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 76166df011a4..781249098cb6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -112,7 +112,7 @@ static void __sched account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) { - int firstnonnull = MAXLR + 1; + int firstnonnull = MAXLR; int i; /* skip kernel threads for now */ @@ -150,7 +150,7 @@ account_global_scheduler_latency(struct task_struct *tsk, } i = firstnonnull; - if (i >= MAXLR - 1) + if (i >= MAXLR) return; /* Allocted a new one: */ diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ec06ce59d728..9ada0bc5247b 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -325,6 +325,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * /sys/kernel/livepatch/<patch>/transition * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/<object> + * /sys/kernel/livepatch/<patch>/<object>/patched * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ static int __klp_disable_patch(struct klp_patch *patch); @@ -431,6 +432,22 @@ static struct attribute *klp_patch_attrs[] = { }; ATTRIBUTE_GROUPS(klp_patch); +static ssize_t patched_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct klp_object *obj; + + obj = container_of(kobj, struct klp_object, kobj); + return sysfs_emit(buf, "%d\n", obj->patched); +} + +static struct kobj_attribute patched_kobj_attr = __ATTR_RO(patched); +static struct attribute *klp_object_attrs[] = { + &patched_kobj_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(klp_object); + static void klp_free_object_dynamic(struct klp_object *obj) { kfree(obj->name); @@ -576,6 +593,7 @@ static void klp_kobj_release_object(struct kobject *kobj) static struct kobj_type klp_ktype_object = { .release = klp_kobj_release_object, .sysfs_ops = &kobj_sysfs_ops, + .default_groups = klp_object_groups, }; static void klp_kobj_release_func(struct kobject *kobj) @@ -1171,7 +1189,7 @@ int klp_module_coming(struct module *mod) return -EINVAL; if (!strcmp(mod->name, "vmlinux")) { - pr_err("vmlinux.ko: invalid module name"); + pr_err("vmlinux.ko: invalid module name\n"); return -EINVAL; } diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 
5d03a2ad1066..30187b1d8275 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -610,9 +610,23 @@ void klp_reverse_transition(void) /* Called from copy_process() during fork */ void klp_copy_process(struct task_struct *child) { - child->patch_state = current->patch_state; - /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ + /* + * The parent process may have gone through a KLP transition since + * the thread flag was copied in setup_thread_stack earlier. Bring + * the task flag up to date with the parent here. + * + * The operation is serialized against all klp_*_transition() + * operations by the tasklist_lock. The only exception is + * klp_update_patch_state(current), but we cannot race with + * that because we are current. + */ + if (test_tsk_thread_flag(current, TIF_PATCH_PENDING)) + set_tsk_thread_flag(child, TIF_PATCH_PENDING); + else + clear_tsk_thread_flag(child, TIF_PATCH_PENDING); + + child->patch_state = current->patch_state; } /* diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index d51cabf28f38..ea925731fa40 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n +KMSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 64a13eb56078..e3375bc40dad 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -934,8 +934,10 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) * Huh! same key, different name? Did someone trample * on some memory? We're most confused. */ - WARN_ON_ONCE(class->name != lock->name && - lock->key != &__lockdep_no_validate__); + WARN_ONCE(class->name != lock->name && + lock->key != &__lockdep_no_validate__, + "Looking for class \"%s\" with key %ps, but found a different class \"%s\" with the same key\n", + lock->name, lock->key, class->name); return class; } } diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 5fe4c5495ba3..185bd1c906b0 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -192,6 +192,12 @@ EXPORT_SYMBOL_GPL(__percpu_down_read); __sum; \ }) +bool percpu_is_read_locked(struct percpu_rw_semaphore *sem) +{ + return per_cpu_sum(*sem->read_count) != 0 && !atomic_read(&sem->block); +} +EXPORT_SYMBOL_GPL(percpu_is_read_locked); + /* * Return true if the modular sum of the sem->read_count per-CPU variable is * zero. 
If this sum is zero, then it is stable due to the fact that if any diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 2e1600906c9f..d2ef312a8611 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -18,7 +18,7 @@ * queued_read_lock_slowpath - acquire read lock of a queued rwlock * @lock: Pointer to queued rwlock structure */ -void queued_read_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_read_lock_slowpath(struct qrwlock *lock) { /* * Readers come here when they cannot get the lock without waiting @@ -63,7 +63,7 @@ EXPORT_SYMBOL(queued_read_lock_slowpath); * queued_write_lock_slowpath - acquire write lock of a queued rwlock * @lock : Pointer to queued rwlock structure */ -void queued_write_lock_slowpath(struct qrwlock *lock) +void __lockfunc queued_write_lock_slowpath(struct qrwlock *lock) { int cnts; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 65a9a10caa6f..2b23378775fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -313,7 +313,7 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, * contended : (*,x,y) +--> (*,0,0) ---> (*,0,1) -' : * queue : ^--' : */ -void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { struct mcs_spinlock *prev, *next, *node; u32 old, tail; diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index e84d21aa0722..6afc249ce697 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -489,7 +489,7 @@ gotlock: * PV versions of the unlock fastpath and slowpath functions to be used * instead of queued_spin_unlock(). */ -__visible void +__visible __lockfunc void __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct pv_node *node; @@ -544,7 +544,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) #include <asm/qspinlock_paravirt.h> #ifndef __pv_queued_spin_unlock -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible __lockfunc void __pv_queued_spin_unlock(struct qspinlock *lock) { u8 locked; diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 65f0262f635e..44873594de03 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -133,14 +133,19 @@ * the owner value concurrently without lock. Read from owner, however, * may not need READ_ONCE() as long as the pointer value is only used * for comparison and isn't being dereferenced. + * + * Both rwsem_{set,clear}_owner() functions should be in the same + * preempt disable section as the atomic op that changes sem->count. 
*/ static inline void rwsem_set_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, (long)current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { + lockdep_assert_preemption_disabled(); atomic_long_set(&sem->owner, 0); } @@ -251,13 +256,16 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) static inline bool rwsem_write_trylock(struct rw_semaphore *sem) { long tmp = RWSEM_UNLOCKED_VALUE; + bool ret = false; + preempt_disable(); if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - return true; + ret = true; } - return false; + preempt_enable(); + return ret; } /* @@ -1352,8 +1360,10 @@ static inline void __up_write(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); + preempt_disable(); rwsem_clear_owner(sem); tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); + preempt_enable(); if (unlikely(tmp & RWSEM_FLAG_WAITERS)) rwsem_wake(sem); } diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index f2654d2fe43a..34bfae72f295 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c @@ -51,7 +51,7 @@ static noinline void __up(struct semaphore *sem); * Use of this function is deprecated, please use down_interruptible() or * down_killable() instead. */ -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { unsigned long flags; @@ -74,7 +74,7 @@ EXPORT_SYMBOL(down); * If the sleep is interrupted by a signal, this function will return -EINTR. * If the semaphore is successfully acquired, this function returns 0. */ -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -101,7 +101,7 @@ EXPORT_SYMBOL(down_interruptible); * -EINTR. If the semaphore is successfully acquired, this function returns * 0. */ -int down_killable(struct semaphore *sem) +int __sched down_killable(struct semaphore *sem) { unsigned long flags; int result = 0; @@ -131,7 +131,7 @@ EXPORT_SYMBOL(down_killable); * Unlike mutex_trylock, this function can be used from interrupt context, * and the semaphore can be released by any task or interrupt. */ -int down_trylock(struct semaphore *sem) +int __sched down_trylock(struct semaphore *sem) { unsigned long flags; int count; @@ -156,7 +156,7 @@ EXPORT_SYMBOL(down_trylock); * If the semaphore is not released within the specified number of jiffies, * this function returns -ETIME. It returns 0 if the semaphore was acquired. */ -int down_timeout(struct semaphore *sem, long timeout) +int __sched down_timeout(struct semaphore *sem, long timeout) { unsigned long flags; int result = 0; @@ -180,7 +180,7 @@ EXPORT_SYMBOL(down_timeout); * Release the semaphore. Unlike mutexes, up() may be called from any * context and even by tasks which have never called down(). 
*/ -void up(struct semaphore *sem) +void __sched up(struct semaphore *sem) { unsigned long flags; diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 7f49baaa4979..8475a0794f8c 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -133,7 +133,7 @@ BUILD_LOCK_OPS(write, rwlock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK -int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) { return __raw_spin_trylock(lock); } @@ -141,7 +141,7 @@ EXPORT_SYMBOL(_raw_spin_trylock); #endif #ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH -int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +noinline int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) { return __raw_spin_trylock_bh(lock); } @@ -149,7 +149,7 @@ EXPORT_SYMBOL(_raw_spin_trylock_bh); #endif #ifndef CONFIG_INLINE_SPIN_LOCK -void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) { __raw_spin_lock(lock); } @@ -157,7 +157,7 @@ EXPORT_SYMBOL(_raw_spin_lock); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE -unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +noinline unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) { return __raw_spin_lock_irqsave(lock); } @@ -165,7 +165,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irqsave); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_IRQ -void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) { __raw_spin_lock_irq(lock); } @@ -173,7 +173,7 @@ EXPORT_SYMBOL(_raw_spin_lock_irq); #endif #ifndef CONFIG_INLINE_SPIN_LOCK_BH -void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) { __raw_spin_lock_bh(lock); } @@ -181,7 +181,7 @@ EXPORT_SYMBOL(_raw_spin_lock_bh); #endif #ifdef CONFIG_UNINLINE_SPIN_UNLOCK -void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) { __raw_spin_unlock(lock); } @@ -189,7 +189,7 @@ EXPORT_SYMBOL(_raw_spin_unlock); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE -void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) { __raw_spin_unlock_irqrestore(lock, flags); } @@ -197,7 +197,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ -void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) { __raw_spin_unlock_irq(lock); } @@ -205,7 +205,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_irq); #endif #ifndef CONFIG_INLINE_SPIN_UNLOCK_BH -void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +noinline void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) { __raw_spin_unlock_bh(lock); } @@ -215,7 +215,7 @@ EXPORT_SYMBOL(_raw_spin_unlock_bh); #ifndef CONFIG_PREEMPT_RT #ifndef CONFIG_INLINE_READ_TRYLOCK -int __lockfunc _raw_read_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_read_trylock(rwlock_t *lock) { return __raw_read_trylock(lock); } @@ -223,7 +223,7 @@ EXPORT_SYMBOL(_raw_read_trylock); #endif #ifndef CONFIG_INLINE_READ_LOCK -void __lockfunc _raw_read_lock(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock(rwlock_t *lock) { __raw_read_lock(lock); } @@ -231,7 +231,7 @@ EXPORT_SYMBOL(_raw_read_lock); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE -unsigned long __lockfunc 
_raw_read_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) { return __raw_read_lock_irqsave(lock); } @@ -239,7 +239,7 @@ EXPORT_SYMBOL(_raw_read_lock_irqsave); #endif #ifndef CONFIG_INLINE_READ_LOCK_IRQ -void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_irq(rwlock_t *lock) { __raw_read_lock_irq(lock); } @@ -247,7 +247,7 @@ EXPORT_SYMBOL(_raw_read_lock_irq); #endif #ifndef CONFIG_INLINE_READ_LOCK_BH -void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_lock_bh(rwlock_t *lock) { __raw_read_lock_bh(lock); } @@ -255,7 +255,7 @@ EXPORT_SYMBOL(_raw_read_lock_bh); #endif #ifndef CONFIG_INLINE_READ_UNLOCK -void __lockfunc _raw_read_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock(rwlock_t *lock) { __raw_read_unlock(lock); } @@ -263,7 +263,7 @@ EXPORT_SYMBOL(_raw_read_unlock); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE -void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_read_unlock_irqrestore(lock, flags); } @@ -271,7 +271,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_IRQ -void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) { __raw_read_unlock_irq(lock); } @@ -279,7 +279,7 @@ EXPORT_SYMBOL(_raw_read_unlock_irq); #endif #ifndef CONFIG_INLINE_READ_UNLOCK_BH -void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) { __raw_read_unlock_bh(lock); } @@ -287,7 +287,7 @@ EXPORT_SYMBOL(_raw_read_unlock_bh); #endif #ifndef CONFIG_INLINE_WRITE_TRYLOCK -int __lockfunc _raw_write_trylock(rwlock_t *lock) +noinline int __lockfunc _raw_write_trylock(rwlock_t *lock) { return __raw_write_trylock(lock); } @@ -295,7 +295,7 @@ EXPORT_SYMBOL(_raw_write_trylock); #endif #ifndef CONFIG_INLINE_WRITE_LOCK -void __lockfunc _raw_write_lock(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock(rwlock_t *lock) { __raw_write_lock(lock); } @@ -313,7 +313,7 @@ EXPORT_SYMBOL(_raw_write_lock_nested); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE -unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +noinline unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) { return __raw_write_lock_irqsave(lock); } @@ -321,7 +321,7 @@ EXPORT_SYMBOL(_raw_write_lock_irqsave); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_IRQ -void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_irq(rwlock_t *lock) { __raw_write_lock_irq(lock); } @@ -329,7 +329,7 @@ EXPORT_SYMBOL(_raw_write_lock_irq); #endif #ifndef CONFIG_INLINE_WRITE_LOCK_BH -void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_lock_bh(rwlock_t *lock) { __raw_write_lock_bh(lock); } @@ -337,7 +337,7 @@ EXPORT_SYMBOL(_raw_write_lock_bh); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK -void __lockfunc _raw_write_unlock(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock(rwlock_t *lock) { __raw_write_unlock(lock); } @@ -345,7 +345,7 @@ EXPORT_SYMBOL(_raw_write_unlock); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE -void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +noinline void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) { __raw_write_unlock_irqrestore(lock, flags); } @@ -353,7 +353,7 @@ 
EXPORT_SYMBOL(_raw_write_unlock_irqrestore); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ -void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) { __raw_write_unlock_irq(lock); } @@ -361,7 +361,7 @@ EXPORT_SYMBOL(_raw_write_unlock_irq); #endif #ifndef CONFIG_INLINE_WRITE_UNLOCK_BH -void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +noinline void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) { __raw_write_unlock_bh(lock); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 353004155d65..43efb2a04160 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -399,7 +399,7 @@ static int *get_random_order(int count) order[n] = n; for (n = count - 1; n > 1; n--) { - r = get_random_int() % (n + 1); + r = prandom_u32_max(n + 1); if (r != n) { tmp = order[n]; order[n] = order[r]; @@ -538,7 +538,7 @@ static void stress_one_work(struct work_struct *work) { struct stress *stress = container_of(work, typeof(*stress), work); const int nlocks = stress->nlocks; - struct ww_mutex *lock = stress->locks + (get_random_int() % nlocks); + struct ww_mutex *lock = stress->locks + prandom_u32_max(nlocks); int err; do { diff --git a/kernel/module/decompress.c b/kernel/module/decompress.c index 4d0bcb3d9e44..c033572d83f0 100644 --- a/kernel/module/decompress.c +++ b/kernel/module/decompress.c @@ -256,7 +256,7 @@ void module_decompress_cleanup(struct load_info *info) static ssize_t compression_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%s\n", __stringify(MODULE_COMPRESSION)); + return sysfs_emit(buf, __stringify(MODULE_COMPRESSION) "\n"); } static struct kobj_attribute module_compression_attr = __ATTR_RO(compression); diff --git a/kernel/module/internal.h b/kernel/module/internal.h index 680d980a4fb2..2e2bf236f558 100644 --- a/kernel/module/internal.h +++ b/kernel/module/internal.h @@ -53,6 +53,7 @@ extern const struct kernel_symbol __stop___ksymtab_gpl[]; extern const s32 __start___kcrctab[]; extern const s32 __start___kcrctab_gpl[]; +#include <linux/dynamic_debug.h> struct load_info { const char *name; /* pointer to module in temporary copy, freed at end of load_module() */ @@ -62,8 +63,7 @@ struct load_info { Elf_Shdr *sechdrs; char *secstrings, *strtab; unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; - struct _ddebug *debug; - unsigned int num_debug; + struct _ddebug_info dyndbg; bool sig_ok; #ifdef CONFIG_KALLSYMS unsigned long mod_kallsyms_init_off; diff --git a/kernel/module/main.c b/kernel/module/main.c index 70c0b2c6fef8..d02d39c7174e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1594,16 +1594,16 @@ static void free_modinfo(struct module *mod) } } -static void dynamic_debug_setup(struct module *mod, struct _ddebug *debug, unsigned int num) +static void dynamic_debug_setup(struct module *mod, struct _ddebug_info *dyndbg) { - if (!debug) + if (!dyndbg->num_descs) return; - ddebug_add_module(debug, num, mod->name); + ddebug_add_module(dyndbg, mod->name); } -static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) +static void dynamic_debug_remove(struct module *mod, struct _ddebug_info *dyndbg) { - if (debug) + if (dyndbg->num_descs) ddebug_remove_module(mod->name); } @@ -2107,8 +2107,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) if (section_addr(info, "__obsparm")) pr_warn("%s: Ignoring obsolete parameters\n", mod->name); - info->debug = 
section_objs(info, "__dyndbg", - sizeof(*info->debug), &info->num_debug); + info->dyndbg.descs = section_objs(info, "__dyndbg", + sizeof(*info->dyndbg.descs), &info->dyndbg.num_descs); + info->dyndbg.classes = section_objs(info, "__dyndbg_classes", + sizeof(*info->dyndbg.classes), &info->dyndbg.num_classes); return 0; } @@ -2799,7 +2801,7 @@ static int load_module(struct load_info *info, const char __user *uargs, } init_build_id(mod, info); - dynamic_debug_setup(mod, info->debug, info->num_debug); + dynamic_debug_setup(mod, &info->dyndbg); /* Ftrace init must be called in the MODULE_STATE_UNFORMED state */ ftrace_module_init(mod); @@ -2863,7 +2865,7 @@ static int load_module(struct load_info *info, const char __user *uargs, ddebug_cleanup: ftrace_release_mod(mod); - dynamic_debug_remove(mod, info->debug); + dynamic_debug_remove(mod, &info->dyndbg); synchronize_rcu(); kfree(mod->args); free_arch_cleanup: diff --git a/kernel/module/tracking.c b/kernel/module/tracking.c index 7f8133044d09..26d812e07615 100644 --- a/kernel/module/tracking.c +++ b/kernel/module/tracking.c @@ -10,6 +10,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/list.h> +#include <linux/debugfs.h> #include <linux/rculist.h> #include "internal.h" @@ -21,6 +22,9 @@ int try_add_tainted_module(struct module *mod) module_assert_mutex_or_preempt(); + if (!mod->taints) + goto out; + list_for_each_entry_rcu(mod_taint, &unloaded_tainted_modules, list, lockdep_is_held(&module_mutex)) { if (!strcmp(mod_taint->name, mod->name) && @@ -59,3 +63,70 @@ void print_unloaded_tainted_modules(void) } } } + +#ifdef CONFIG_DEBUG_FS +static void *unloaded_tainted_modules_seq_start(struct seq_file *m, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + return seq_list_start_rcu(&unloaded_tainted_modules, *pos); +} + +static void *unloaded_tainted_modules_seq_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next_rcu(p, &unloaded_tainted_modules, pos); +} + +static void unloaded_tainted_modules_seq_stop(struct seq_file *m, void *p) + __releases(rcu) +{ + rcu_read_unlock(); +} + +static int unloaded_tainted_modules_seq_show(struct seq_file *m, void *p) +{ + struct mod_unload_taint *mod_taint; + char buf[MODULE_FLAGS_BUF_SIZE]; + size_t l; + + mod_taint = list_entry(p, struct mod_unload_taint, list); + l = module_flags_taint(mod_taint->taints, buf); + buf[l++] = '\0'; + + seq_printf(m, "%s (%s) %llu", mod_taint->name, buf, mod_taint->count); + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations unloaded_tainted_modules_seq_ops = { + .start = unloaded_tainted_modules_seq_start, + .next = unloaded_tainted_modules_seq_next, + .stop = unloaded_tainted_modules_seq_stop, + .show = unloaded_tainted_modules_seq_show, +}; + +static int unloaded_tainted_modules_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &unloaded_tainted_modules_seq_ops); +} + +static const struct file_operations unloaded_tainted_modules_fops = { + .open = unloaded_tainted_modules_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init unloaded_tainted_modules_init(void) +{ + struct dentry *dir; + + dir = debugfs_create_dir("modules", NULL); + debugfs_create_file("unloaded_tainted", 0444, dir, NULL, + &unloaded_tainted_modules_fops); + + return 0; +} +module_init(unloaded_tainted_modules_init); +#endif /* CONFIG_DEBUG_FS */ diff --git a/kernel/panic.c b/kernel/panic.c index c6eb8f8db0c0..da323209f583 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -329,9 
+329,6 @@ void panic(const char *fmt, ...) if (_crash_kexec_post_notifiers) __crash_kexec(NULL); -#ifdef CONFIG_VT - unblank_screen(); -#endif console_unblank(); /* diff --git a/kernel/pid.c b/kernel/pid.c index 2fc0a16ec77b..3fbc5e46b721 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -519,6 +519,7 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } +EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 89c71fce225d..f58a0aa92310 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -92,20 +92,24 @@ bool hibernation_available(void) */ void hibernation_set_ops(const struct platform_hibernation_ops *ops) { + unsigned int sleep_flags; + if (ops && !(ops->begin && ops->end && ops->pre_snapshot && ops->prepare && ops->finish && ops->enter && ops->pre_restore && ops->restore_cleanup && ops->leave)) { WARN_ON(1); return; } - lock_system_sleep(); + + sleep_flags = lock_system_sleep(); + hibernation_ops = ops; if (ops) hibernation_mode = HIBERNATION_PLATFORM; else if (hibernation_mode == HIBERNATION_PLATFORM) hibernation_mode = HIBERNATION_SHUTDOWN; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(hibernation_set_ops); @@ -713,6 +717,7 @@ static int load_image_and_restore(void) int hibernate(void) { bool snapshot_test = false; + unsigned int sleep_flags; int error; if (!hibernation_available()) { @@ -720,7 +725,7 @@ int hibernate(void) return -EPERM; } - lock_system_sleep(); + sleep_flags = lock_system_sleep(); /* The snapshot device should not be opened while we're running */ if (!hibernate_acquire()) { error = -EBUSY; @@ -794,7 +799,7 @@ int hibernate(void) pm_restore_console(); hibernate_release(); Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); pr_info("hibernation exit\n"); return error; @@ -809,9 +814,10 @@ int hibernate(void) */ int hibernate_quiet_exec(int (*func)(void *data), void *data) { + unsigned int sleep_flags; int error; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -891,7 +897,7 @@ restore: hibernate_release(); unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -1100,11 +1106,12 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + int mode = HIBERNATION_INVALID; + unsigned int sleep_flags; int error = 0; - int i; int len; char *p; - int mode = HIBERNATION_INVALID; + int i; if (!hibernation_available()) return -EPERM; @@ -1112,7 +1119,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { if (len == strlen(hibernation_modes[i]) && !strncmp(buf, hibernation_modes[i], len)) { @@ -1142,7 +1149,7 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, if (!error) pm_pr_dbg("Hibernation mode set to '%s'\n", hibernation_modes[mode]); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? 
error : n; } @@ -1158,9 +1165,10 @@ static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { - dev_t res; + unsigned int sleep_flags; int len = n; char *name; + dev_t res; if (len && buf[len-1] == '\n') len--; @@ -1173,9 +1181,10 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, if (!res) return -EINVAL; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_resume_device = res; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); + pm_pr_dbg("Configured hibernation resume from disk to %u\n", swsusp_resume_device); noresume = 0; diff --git a/kernel/power/main.c b/kernel/power/main.c index e3694034b753..31ec4a9b9d70 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -21,14 +21,16 @@ #ifdef CONFIG_PM_SLEEP -void lock_system_sleep(void) +unsigned int lock_system_sleep(void) { - current->flags |= PF_FREEZER_SKIP; + unsigned int flags = current->flags; + current->flags |= PF_NOFREEZE; mutex_lock(&system_transition_mutex); + return flags; } EXPORT_SYMBOL_GPL(lock_system_sleep); -void unlock_system_sleep(void) +void unlock_system_sleep(unsigned int flags) { /* * Don't use freezer_count() because we don't want the call to @@ -46,7 +48,8 @@ void unlock_system_sleep(void) * Which means, if we use try_to_freeze() here, it would make them * enter the refrigerator, thus causing hibernation to lockup. */ - current->flags &= ~PF_FREEZER_SKIP; + if (!(flags & PF_NOFREEZE)) + current->flags &= ~PF_NOFREEZE; mutex_unlock(&system_transition_mutex); } EXPORT_SYMBOL_GPL(unlock_system_sleep); @@ -263,16 +266,17 @@ static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t n) { + unsigned int sleep_flags; const char * const *s; + int error = -EINVAL; int level; char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); level = TEST_FIRST; for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) @@ -282,7 +286,7 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, break; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error ? error : n; } diff --git a/kernel/power/process.c b/kernel/power/process.c index 3068601e585a..ddd9988327fe 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -50,8 +50,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - if (!freezer_should_skip(p)) - todo++; + todo++; } read_unlock(&tasklist_lock); @@ -96,8 +95,7 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup || pm_debug_messages_on) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) - && freezing(p) && !frozen(p)) + if (p != current && freezing(p) && !frozen(p)) sched_show_task(p); } read_unlock(&tasklist_lock); @@ -129,7 +127,7 @@ int freeze_processes(void) current->flags |= PF_SUSPEND_TASK; if (!pm_freezing) - atomic_inc(&system_freezing_cnt); + static_branch_inc(&freezer_active); pm_wakeup_clear(0); pr_info("Freezing user space processes ... 
"); @@ -190,7 +188,7 @@ void thaw_processes(void) trace_suspend_resume(TPS("thaw_processes"), 0, true); if (pm_freezing) - atomic_dec(&system_freezing_cnt); + static_branch_dec(&freezer_active); pm_freezing = false; pm_nosig_freezing = false; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 827075944d28..fa3bf161d13f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -75,9 +75,11 @@ EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); void s2idle_set_ops(const struct platform_s2idle_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); s2idle_ops = ops; - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } static void s2idle_begin(void) @@ -136,6 +138,9 @@ static void s2idle_loop(void) break; } + if (s2idle_ops && s2idle_ops->check) + s2idle_ops->check(); + s2idle_enter(); } @@ -200,7 +205,9 @@ __setup("mem_sleep_default=", mem_sleep_default_setup); */ void suspend_set_ops(const struct platform_suspend_ops *ops) { - lock_system_sleep(); + unsigned int sleep_flags; + + sleep_flags = lock_system_sleep(); suspend_ops = ops; @@ -216,7 +223,7 @@ void suspend_set_ops(const struct platform_suspend_ops *ops) mem_sleep_current = PM_SUSPEND_MEM; } - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); } EXPORT_SYMBOL_GPL(suspend_set_ops); diff --git a/kernel/power/user.c b/kernel/power/user.c index d43c2aa583b2..3a4e70366f35 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -47,12 +47,13 @@ int is_hibernate_resume_dev(dev_t dev) static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; int error; if (!hibernation_available()) return -EPERM; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; @@ -98,7 +99,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->dev = 0; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return error; } @@ -106,8 +107,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; + unsigned int sleep_flags; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); swsusp_free(); data = filp->private_data; @@ -124,7 +126,7 @@ static int snapshot_release(struct inode *inode, struct file *filp) PM_POST_HIBERNATION : PM_POST_RESTORE); hibernate_release(); - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return 0; } @@ -132,11 +134,12 @@ static int snapshot_release(struct inode *inode, struct file *filp) static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned int sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { @@ -157,7 +160,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, *offp += res; Unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } @@ -165,16 +168,17 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf, static ssize_t snapshot_write(struct file *filp, const char __user *buf, size_t count, loff_t *offp) { + loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; + unsigned long sleep_flags; ssize_t res; - loff_t pg_offp = *offp & ~PAGE_MASK; if (need_wait) { wait_for_device_probe(); 
need_wait = false; } - lock_system_sleep(); + sleep_flags = lock_system_sleep(); data = filp->private_data; @@ -196,7 +200,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, if (res > 0) *offp += res; unlock: - unlock_system_sleep(); + unlock_system_sleep(sleep_flags); return res; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index a1a81fd9889b..e4f1e7478b52 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -220,9 +220,6 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, } #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */ -/* Number of registered extended console drivers. */ -static int nr_ext_console_drivers; - /* * Helper macros to handle lockdep when locking/unlocking console_sem. We use * macros instead of functions so that _RET_IP_ contains useful information. @@ -433,7 +430,7 @@ static struct printk_ringbuffer *prb = &printk_rb_static; * per_cpu_areas are initialised. This variable is set to true when * it's safe to access per-CPU data. */ -static bool __printk_percpu_data_ready __read_mostly; +static bool __printk_percpu_data_ready __ro_after_init; bool printk_percpu_data_ready(void) { @@ -2296,6 +2293,7 @@ asmlinkage __visible int _printk(const char *fmt, ...) } EXPORT_SYMBOL(_printk); +static bool pr_flush(int timeout_ms, bool reset_on_progress); static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress); #else /* CONFIG_PRINTK */ @@ -2330,6 +2328,7 @@ static void call_console_driver(struct console *con, const char *text, size_t le { } static bool suppress_message_printing(int level) { return false; } +static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; } static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; } #endif /* CONFIG_PRINTK */ @@ -3186,9 +3185,6 @@ void register_console(struct console *newcon) console_drivers->next = newcon; } - if (newcon->flags & CON_EXTENDED) - nr_ext_console_drivers++; - newcon->dropped = 0; if (newcon->flags & CON_PRINTBUFFER) { /* Get a consistent copy of @syslog_seq. */ @@ -3213,9 +3209,6 @@ void register_console(struct console *newcon) if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { - /* We need to iterate through all boot consoles, to make - * sure we print everything out, before we unregister them. - */ for_each_console(con) if (con->flags & CON_BOOT) unregister_console(con); @@ -3254,9 +3247,6 @@ int unregister_console(struct console *console) if (res) goto out_disable_unlock; - if (console->flags & CON_EXTENDED) - nr_ext_console_drivers--; - /* * If this isn't the last console and it has CON_CONSDEV set, we * need to set it on the next preferred console. @@ -3438,11 +3428,10 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre * Context: Process context. May sleep while acquiring console lock. * Return: true if all enabled printers are caught up. 
*/ -bool pr_flush(int timeout_ms, bool reset_on_progress) +static bool pr_flush(int timeout_ms, bool reset_on_progress) { return __pr_flush(NULL, timeout_ms, reset_on_progress); } -EXPORT_SYMBOL(pr_flush); /* * Delayed printk version, for scheduler-internal messages: diff --git a/kernel/profile.c b/kernel/profile.c index 7ea01ba30e75..8a77769bc4b4 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -59,43 +59,39 @@ int profile_setup(char *str) static const char schedstr[] = "schedule"; static const char sleepstr[] = "sleep"; static const char kvmstr[] = "kvm"; + const char *select = NULL; int par; if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS force_schedstat_enabled(); prof_on = SLEEP_PROFILING; - if (str[strlen(sleepstr)] == ',') - str += strlen(sleepstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel sleep profiling enabled (shift: %u)\n", - prof_shift); + select = sleepstr; #else pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); #endif /* CONFIG_SCHEDSTATS */ } else if (!strncmp(str, schedstr, strlen(schedstr))) { prof_on = SCHED_PROFILING; - if (str[strlen(schedstr)] == ',') - str += strlen(schedstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel schedule profiling enabled (shift: %u)\n", - prof_shift); + select = schedstr; } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { prof_on = KVM_PROFILING; - if (str[strlen(kvmstr)] == ',') - str += strlen(kvmstr) + 1; - if (get_option(&str, &par)) - prof_shift = clamp(par, 0, BITS_PER_LONG - 1); - pr_info("kernel KVM profiling enabled (shift: %u)\n", - prof_shift); + select = kvmstr; } else if (get_option(&str, &par)) { prof_shift = clamp(par, 0, BITS_PER_LONG - 1); prof_on = CPU_PROFILING; pr_info("kernel profiling enabled (shift: %u)\n", prof_shift); } + + if (select) { + if (str[strlen(select)] == ',') + str += strlen(select) + 1; + if (get_option(&str, &par)) + prof_shift = clamp(par, 0, BITS_PER_LONG - 1); + pr_info("kernel %s profiling enabled (shift: %u)\n", + select, prof_shift); + } + return 1; } __setup("profile=", profile_setup); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1893d909e45c..54482193e1ed 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -269,7 +269,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) read_unlock(&tasklist_lock); if (!ret && !ignore_state && - WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED))) + WARN_ON_ONCE(!wait_task_inactive(child, __TASK_TRACED|TASK_FROZEN))) ret = -ESRCH; return ret; diff --git a/kernel/reboot.c b/kernel/reboot.c index 3c35445bf5ad..3bba88c7ffc6 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -243,6 +243,17 @@ void migrate_to_reboot_cpu(void) set_cpus_allowed_ptr(current, cpumask_of(cpu)); } +/* + * Notifier list for kernel code which wants to be called + * to prepare system for restart. 
+ */ +static BLOCKING_NOTIFIER_HEAD(restart_prep_handler_list); + +static void do_kernel_restart_prepare(void) +{ + blocking_notifier_call_chain(&restart_prep_handler_list, 0, NULL); +} + /** * kernel_restart - reboot the system * @cmd: pointer to buffer containing command to execute for restart @@ -254,6 +265,7 @@ void migrate_to_reboot_cpu(void) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + do_kernel_restart_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) @@ -396,6 +408,11 @@ register_sys_off_handler(enum sys_off_mode mode, handler->list = &power_off_handler_list; break; + case SYS_OFF_MODE_RESTART_PREPARE: + handler->list = &restart_prep_handler_list; + handler->blocking = true; + break; + case SYS_OFF_MODE_RESTART: handler->list = &restart_handler_list; break; diff --git a/kernel/relay.c b/kernel/relay.c index 6a611e779e95..d7edc934c56d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,10 +60,7 @@ static const struct vm_operations_struct relay_file_mmap_ops = { */ static struct page **relay_alloc_page_array(unsigned int n_pages) { - const size_t pa_size = n_pages * sizeof(struct page *); - if (pa_size > PAGE_SIZE) - return vzalloc(pa_size); - return kzalloc(pa_size, GFP_KERNEL); + return kvcalloc(n_pages, sizeof(struct page *), GFP_KERNEL); } /* diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 4ebaf97f7bd8..991fc9002535 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -161,7 +161,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) struct task_struct *t; unsigned long flags; - BUG_ON(!lock_task_sighand(p, &flags)); + if (WARN_ON_ONCE(!lock_task_sighand(p, &flags))) + return; prev = p->signal->autogroup; if (prev == ag) { diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 35f15c26ed54..d57a5c1c1cd9 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -204,6 +204,7 @@ EXPORT_SYMBOL(wait_for_completion_io_timeout); int __sched wait_for_completion_interruptible(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); + if (t == -ERESTARTSYS) return t; return 0; @@ -241,12 +242,23 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); int __sched wait_for_completion_killable(struct completion *x) { long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) return t; return 0; } EXPORT_SYMBOL(wait_for_completion_killable); +int __sched wait_for_completion_state(struct completion *x, unsigned int state) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, state); + + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_state); + /** * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) * @x: holds the state of this particular completion diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 60fdc0faf1c9..5800b0623ff3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -143,11 +143,7 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. 
*/ -#ifdef CONFIG_PREEMPT_RT -const_debug unsigned int sysctl_sched_nr_migrate = 8; -#else -const_debug unsigned int sysctl_sched_nr_migrate = 32; -#endif +const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK; __read_mostly int scheduler_running; @@ -361,10 +357,7 @@ static void __sched_core_flip(bool enabled) /* * Toggle the offline CPUs. */ - cpumask_copy(&sched_core_mask, cpu_possible_mask); - cpumask_andnot(&sched_core_mask, &sched_core_mask, cpu_online_mask); - - for_each_cpu(cpu, &sched_core_mask) + for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask) cpu_rq(cpu)->core_enabled = enabled; cpus_read_unlock(); @@ -482,8 +475,7 @@ sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { } * p->se.load, p->rt_priority, * p->dl.dl_{runtime, deadline, period, flags, bw, density} * - sched_setnuma(): p->numa_preferred_nid * - sched_move_task()/ * cpu_cgroup_fork(): p->sched_task_group + * - sched_move_task(): p->sched_task_group * - uclamp_update_active() p->uclamp* * * p->state <- TASK_*: @@ -709,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; + psi_account_irqtime(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((&paravirt_steal_rq_enabled))) { @@ -2329,7 +2322,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq = cpu_rq(new_cpu); rq_lock(rq, rf); - BUG_ON(task_cpu(p) != new_cpu); + WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); @@ -2779,7 +2772,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag return -EINVAL; } - if (task_running(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { + if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) { /* * MIGRATE_ENABLE gets here because 'p == current', but for * anything else we cannot do is_migration_disabled(), punt @@ -3255,12 +3248,12 @@ out: /* * wait_task_inactive - wait for a thread to unschedule. * - * If @match_state is nonzero, it's the @p->state value just checked and - * not expected to change. If it changes, i.e. @p might have woken up, - * then return zero. When we succeed in waiting for @p to be off its CPU, - * we return a positive number (its total switch count). If a second call - * a short while later returns the same number, the caller can be sure that - * @p has remained unscheduled the whole time. + * Wait for the thread to block in any of the states set in @match_state. + * If it changes, i.e. @p might have woken up, then return zero. When we + * succeed in waiting for @p to be off its CPU, we return a positive number + * (its total switch count). If a second call a short while later returns the + * same number, the caller can be sure that @p has remained unscheduled the + * whole time. + * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't @@ -3291,12 +3284,12 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state * * NOTE! Since we don't hold any locks, it's not * even sure that "rq" stays as the right runqueue! - * But we don't care, since "task_running()" will + * But we don't care, since "task_on_cpu()" will * return false if the runqueue has changed and p * is actually now running somewhere else! 
*/ - while (task_running(rq, p)) { - if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) + while (task_on_cpu(rq, p)) { + if (!(READ_ONCE(p->__state) & match_state)) return 0; cpu_relax(); } @@ -3308,10 +3301,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state */ rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); - running = task_running(rq, p); + running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || READ_ONCE(p->__state) == match_state) + if (READ_ONCE(p->__state) & match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -4397,6 +4390,17 @@ void set_numabalancing_state(bool enabled) } #ifdef CONFIG_PROC_SYSCTL +static void reset_memory_tiering(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + pgdat->nbp_threshold = 0; + pgdat->nbp_th_nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + pgdat->nbp_th_start = jiffies_to_msecs(jiffies); + } +} + int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -4413,6 +4417,9 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, if (err < 0) return err; if (write) { + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + (state & NUMA_BALANCING_MEMORY_TIERING)) + reset_memory_tiering(); sysctl_numa_balancing_mode = state; __set_numabalancing_state(state); } @@ -5167,6 +5174,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * finish_task_switch()'s mmdrop(). */ switch_mm_irqs_off(prev->active_mm, next->mm, next); + lru_gen_use_mm(next->mm); if (!prev->mm) { // from kernel /* will mmdrop() in finish_task_switch(). */ @@ -6430,7 +6438,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUPTIBLE) && !(prev_state & TASK_NOLOAD) && - !(prev->flags & PF_FROZEN); + !(prev_state & TASK_FROZEN); if (prev->sched_contributes_to_load) rq->nr_uninterruptible++; @@ -8650,7 +8658,7 @@ again: if (curr->sched_class != p->sched_class) goto out_unlock; - if (task_running(p_rq, p) || !task_is_running(p)) + if (task_on_cpu(p_rq, p) || !task_is_running(p)) goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); @@ -8862,7 +8870,7 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", + pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", free, task_pid_nr(p), ppid, read_task_thread_flags(p)); @@ -8890,7 +8898,7 @@ state_filter_match(unsigned long state_filter, struct task_struct *p) * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows * TASK_KILLABLE). 
*/ - if (state_filter == TASK_UNINTERRUPTIBLE && state == TASK_IDLE) + if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD)) return false; return true; @@ -9602,9 +9610,6 @@ LIST_HEAD(task_groups); static struct kmem_cache *task_group_cache __read_mostly; #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); -DECLARE_PER_CPU(cpumask_var_t, select_rq_mask); - void __init sched_init(void) { unsigned long ptr = 0; @@ -9648,14 +9653,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } -#ifdef CONFIG_CPUMASK_OFFSTACK - for_each_possible_cpu(i) { - per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - per_cpu(select_rq_mask, i) = (cpumask_var_t)kzalloc_node( - cpumask_size(), GFP_KERNEL, cpu_to_node(i)); - } -#endif /* CONFIG_CPUMASK_OFFSTACK */ init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); @@ -10164,7 +10161,7 @@ void sched_release_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -static void sched_change_group(struct task_struct *tsk, int type) +static void sched_change_group(struct task_struct *tsk) { struct task_group *tg; @@ -10180,7 +10177,7 @@ static void sched_change_group(struct task_struct *tsk, int type) #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) - tsk->sched_class->task_change_group(tsk, type); + tsk->sched_class->task_change_group(tsk); else #endif set_task_rq(tsk, task_cpu(tsk)); @@ -10211,7 +10208,7 @@ void sched_move_task(struct task_struct *tsk) if (running) put_prev_task(rq, tsk); - sched_change_group(tsk, TASK_MOVE_GROUP); + sched_change_group(tsk); if (queued) enqueue_task(rq, tsk, queue_flags); @@ -10289,53 +10286,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_unregister_group(tg); } -/* - * This is called before wake_up_new_task(), therefore we really only - * have to set its group bits, all the other stuff does not apply. - */ -static void cpu_cgroup_fork(struct task_struct *task) -{ - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(task, &rf); - - update_rq_clock(rq); - sched_change_group(task, TASK_SET_GROUP); - - task_rq_unlock(rq, task, &rf); -} - +#ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; - int ret = 0; cgroup_taskset_for_each(task, css, tset) { -#ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; -#endif - /* - * Serialize against wake_up_new_task() such that if it's - * running, we're sure to observe its full state. - */ - raw_spin_lock_irq(&task->pi_lock); - /* - * Avoid calling sched_move_task() before wake_up_new_task() - * has happened. This would lead to problems with PELT, due to - * move wanting to detach+attach while we're not attached yet. 
- */ - if (READ_ONCE(task->__state) == TASK_NEW) - ret = -EINVAL; - raw_spin_unlock_irq(&task->pi_lock); - - if (ret) - break; } - return ret; + return 0; } +#endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) { @@ -11171,8 +11134,9 @@ struct cgroup_subsys cpu_cgrp_subsys = { .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, - .fork = cpu_cgroup_fork, +#ifdef CONFIG_RT_GROUP_SCHED .can_attach = cpu_cgroup_can_attach, +#endif .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_legacy_files, .dfl_cftypes = cpu_files, diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c index 93878cb2a46d..a57fd8f27498 100644 --- a/kernel/sched/core_sched.c +++ b/kernel/sched/core_sched.c @@ -88,7 +88,7 @@ static unsigned long sched_core_update_cookie(struct task_struct *p, * core has now entered/left forced idle state. Defer accounting to the * next scheduling edge, rather than always forcing a reschedule here. */ - if (task_running(rq, p)) + if (task_on_cpu(rq, p)) resched_curr(rq); task_rq_unlock(rq, p, &rf); @@ -205,7 +205,7 @@ int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, default: err = -EINVAL; goto out; - }; + } if (type == PIDTYPE_PID) { __sched_core_set(task, cookie); diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 02d970a879ed..57c92d751bcd 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -123,7 +123,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, unsigned long cap, max_cap = 0; int cpu, max_cpu = -1; - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return 1; /* Ensure the capacity of the CPUs fits the task. */ diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index fa9ce9d83683..a286e726eb4b 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -147,7 +147,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, int task_pri = convert_prio(p->prio); int idx, cpu; - BUG_ON(task_pri >= CPUPRI_NR_PRIORITIES); + WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES); for (idx = 0; idx < task_pri; idx++) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0ab79d819a0d..86dea6a05267 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -124,15 +124,12 @@ static inline int dl_bw_cpus(int i) return cpus; } -static inline unsigned long __dl_bw_capacity(int i) +static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) { - struct root_domain *rd = cpu_rq(i)->rd; unsigned long cap = 0; + int i; - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), - "sched RCU must be held"); - - for_each_cpu_and(i, rd->span, cpu_active_mask) + for_each_cpu_and(i, mask, cpu_active_mask) cap += capacity_orig_of(i); return cap; @@ -144,11 +141,14 @@ static inline unsigned long __dl_bw_capacity(int i) */ static inline unsigned long dl_bw_capacity(int i) { - if (!static_branch_unlikely(&sched_asym_cpucapacity) && + if (!sched_asym_cpucap_active() && capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; } else { - return __dl_bw_capacity(i); + RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), + "sched RCU must be held"); + + return __dl_bw_capacity(cpu_rq(i)->rd->span); } } @@ -310,7 +310,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; - BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); if (task_on_rq_queued(p)) return; 
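The deadline.c hunk above reworks __dl_bw_capacity() to take a cpumask and sum capacity_orig_of() over the active CPUs in it, while dl_bw_capacity() keeps a fast path that simply shifts the CPU count when asymmetric CPU capacity is not active and the CPU has the default SCHED_CAPACITY_SCALE capacity. The following user-space sketch models that idea; the array standing in for the cpumask, the per-CPU capacity table, and the helper names are illustrative assumptions, not the kernel implementation.

/*
 * Toy model of the capacity-aware deadline-bandwidth sum.
 * Assumptions: SCHED_CAPACITY_SHIFT/SCALE use the kernel values; a bool
 * array plays the role of the cpumask and cpu_capacity[] stands in for
 * capacity_orig_of().
 */
#include <stdio.h>
#include <stdbool.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)
#define NR_CPUS			4

static const unsigned long cpu_capacity[NR_CPUS] = { 1024, 1024, 512, 512 };
static const bool cpu_active[NR_CPUS] = { true, true, true, false };

/* Sum the original capacity of every active CPU in @mask (cf. __dl_bw_capacity()). */
static unsigned long mask_capacity(const bool mask[NR_CPUS])
{
	unsigned long cap = 0;

	for (int i = 0; i < NR_CPUS; i++)
		if (mask[i] && cpu_active[i])
			cap += cpu_capacity[i];
	return cap;
}

/* Fast path when every CPU is symmetric at SCHED_CAPACITY_SCALE. */
static unsigned long symmetric_capacity(unsigned int nr_cpus)
{
	return (unsigned long)nr_cpus << SCHED_CAPACITY_SHIFT;
}

int main(void)
{
	const bool span[NR_CPUS] = { true, true, true, true };

	/* 1024 + 1024 + 512 = 2560: CPU 3 is inactive, CPUs 2-3 are small. */
	printf("summed capacity: %lu\n", mask_capacity(span));
	/* 4 << 10 = 4096: what the shortcut reports for 4 symmetric CPUs. */
	printf("symmetric capacity: %lu\n", symmetric_capacity(4));
	return 0;
}

The same mask-based sum is what the later dl_cpuset_cpumask_can_shrink() hunk feeds into the bandwidth overflow check, replacing the old comparison against a raw CPU count.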
@@ -431,8 +431,8 @@ static void task_non_contending(struct task_struct *p) sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); + __dl_clear_params(p); } return; @@ -607,7 +607,7 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct rb_node *leftmost; - BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); + WARN_ON_ONCE(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); leftmost = rb_add_cached(&p->pushable_dl_tasks, &rq->dl.pushable_dl_tasks_root, @@ -684,7 +684,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p * Failed to find any suitable CPU. * The task will never come back! */ - BUG_ON(dl_bandwidth_enabled()); + WARN_ON_ONCE(dl_bandwidth_enabled()); /* * If admission control is disabled we @@ -770,6 +770,14 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + struct rq *rq) +{ + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; +} + /* * We are being explicitly informed that a new instance is starting, * and this means that: @@ -803,8 +811,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * future; in fact, we must consider execution overheads (time * spent on hardirq context, etc.). */ - dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline; - dl_se->runtime = dl_se->dl_runtime; + replenish_dl_new_period(dl_se, rq); } /* @@ -830,16 +837,14 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_of(dl_se)->dl_runtime <= 0); + WARN_ON_ONCE(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. 
*/ - if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; - } + if (dl_se->dl_deadline == 0) + replenish_dl_new_period(dl_se, rq); if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -866,8 +871,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); } if (dl_se->dl_yielded) @@ -1024,8 +1028,7 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) return; } - dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; - dl_se->runtime = pi_of(dl_se)->dl_runtime; + replenish_dl_new_period(dl_se, rq); } } @@ -1333,11 +1336,7 @@ static void update_curr_dl(struct rq *rq) trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (dl_entity_is_special(dl_se)) return; @@ -1616,7 +1615,7 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); + WARN_ON_ONCE(!RB_EMPTY_NODE(&dl_se->rb_node)); rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); @@ -1640,7 +1639,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { - BUG_ON(on_dl_rq(dl_se)); + WARN_ON_ONCE(on_dl_rq(dl_se)); update_stats_enqueue_dl(dl_rq_of_se(dl_se), dl_se, flags); @@ -1814,6 +1813,14 @@ static void yield_task_dl(struct rq *rq) #ifdef CONFIG_SMP +static inline bool dl_task_is_earliest_deadline(struct task_struct *p, + struct rq *rq) +{ + return (!rq->dl.dl_nr_running || + dl_time_before(p->dl.deadline, + rq->dl.earliest_dl.curr)); +} + static int find_later_rq(struct task_struct *task); static int @@ -1849,16 +1856,14 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) * Take the capacity of the CPU into account to * ensure it fits the requirement of the task. 
*/ - if (static_branch_unlikely(&sched_asym_cpucapacity)) + if (sched_asym_cpucap_active()) select_rq |= !dl_task_fits_capacity(p, cpu); if (select_rq) { int target = find_later_rq(p); if (target != -1 && - (dl_time_before(p->dl.deadline, - cpu_rq(target)->dl.earliest_dl.curr) || - (cpu_rq(target)->dl.dl_nr_running == 0))) + dl_task_is_earliest_deadline(p, cpu_rq(target))) cpu = target; } rcu_read_unlock(); @@ -2017,7 +2022,7 @@ static struct task_struct *pick_task_dl(struct rq *rq) return NULL; dl_se = pick_next_dl_entity(dl_rq); - BUG_ON(!dl_se); + WARN_ON_ONCE(!dl_se); p = dl_task_of(dl_se); return p; @@ -2087,7 +2092,7 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; @@ -2225,9 +2230,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) later_rq = cpu_rq(cpu); - if (later_rq->dl.dl_nr_running && - !dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) { + if (!dl_task_is_earliest_deadline(task, later_rq)) { /* * Target rq has tasks of equal or earlier deadline, * retrying does not release any lock and is unlikely @@ -2241,7 +2244,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || + task_on_cpu(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { double_unlock_balance(rq, later_rq); @@ -2255,9 +2258,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) * its earliest one has a later deadline than our * task, the rq is a good one. */ - if (!later_rq->dl.dl_nr_running || - dl_time_before(task->dl.deadline, - later_rq->dl.earliest_dl.curr)) + if (dl_task_is_earliest_deadline(task, later_rq)) break; /* Otherwise we try again. */ @@ -2277,12 +2278,12 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq) p = __node_2_pdl(rb_first_cached(&rq->dl.pushable_dl_tasks_root)); - BUG_ON(rq->cpu != task_cpu(p)); - BUG_ON(task_current(rq, p)); - BUG_ON(p->nr_cpus_allowed <= 1); + WARN_ON_ONCE(rq->cpu != task_cpu(p)); + WARN_ON_ONCE(task_current(rq, p)); + WARN_ON_ONCE(p->nr_cpus_allowed <= 1); - BUG_ON(!task_on_rq_queued(p)); - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!task_on_rq_queued(p)); + WARN_ON_ONCE(!dl_task(p)); return p; } @@ -2428,9 +2429,7 @@ static void pull_dl_task(struct rq *this_rq) * - it will preempt the last one we pulled (if any). 
*/ if (p && dl_time_before(p->dl.deadline, dmin) && - (!this_rq->dl.dl_nr_running || - dl_time_before(p->dl.deadline, - this_rq->dl.earliest_dl.curr))) { + dl_task_is_earliest_deadline(p, this_rq)) { WARN_ON(p == src_rq->curr); WARN_ON(!task_on_rq_queued(p)); @@ -2475,7 +2474,7 @@ skip: */ static void task_woken_dl(struct rq *rq, struct task_struct *p) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && dl_task(rq->curr) && @@ -2492,7 +2491,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, struct root_domain *src_rd; struct rq *rq; - BUG_ON(!dl_task(p)); + WARN_ON_ONCE(!dl_task(p)); rq = task_rq(p); src_rd = rq->rd; @@ -3007,17 +3006,15 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) { - int ret = 1, trial_cpus; + unsigned long flags, cap; struct dl_bw *cur_dl_b; - unsigned long flags; + int ret = 1; rcu_read_lock_sched(); cur_dl_b = dl_bw_of(cpumask_any(cur)); - trial_cpus = cpumask_weight(trial); - + cap = __dl_bw_capacity(trial); raw_spin_lock_irqsave(&cur_dl_b->lock, flags); - if (cur_dl_b->bw != -1 && - cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw) + if (__dl_overflow(cur_dl_b, cap, 0, 0)) ret = 0; raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); rcu_read_unlock_sched(); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 667876da8382..1637b65ba07a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -333,6 +333,7 @@ static __init int sched_init_debug(void) debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min); debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max); debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size); + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); #endif debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 914096c5b1ae..e4a0b8bd941c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -40,6 +40,7 @@ #include <linux/cpuidle.h> #include <linux/interrupt.h> +#include <linux/memory-tiers.h> #include <linux/mempolicy.h> #include <linux/mutex_api.h> #include <linux/profile.h> @@ -799,8 +800,6 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } -static void attach_entity_cfs_rq(struct sched_entity *se); - /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -835,20 +834,6 @@ void post_init_entity_util_avg(struct task_struct *p) long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; - if (cap > 0) { - if (cfs_rq->avg.util_avg != 0) { - sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; - sa->util_avg /= (cfs_rq->avg.load_avg + 1); - - if (sa->util_avg > cap) - sa->util_avg = cap; - } else { - sa->util_avg = cap; - } - } - - sa->runnable_avg = sa->util_avg; - if (p->sched_class != &fair_sched_class) { /* * For !fair tasks do: @@ -864,7 +849,19 @@ void post_init_entity_util_avg(struct task_struct *p) return; } - attach_entity_cfs_rq(se); + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + 
+ if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap; + } + } + + sa->runnable_avg = sa->util_avg; } #else /* !CONFIG_SMP */ @@ -1094,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; + +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + struct numa_group { refcount_t refcount; @@ -1436,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } +/* + * If memory tiering mode is enabled, cpupid of slow memory page is + * used to record scan time instead of CPU and PID. When tiering mode + * is disabled at run time, the scan time (in cpupid) will be + * interpreted as CPU and PID. So CPU needs to be checked to avoid to + * access out of array bound. + */ +static inline bool cpupid_valid(int cpupid) +{ + return cpupid_to_cpu(cpupid) < nr_cpu_ids; +} + +/* + * For memory tiering mode, if there are enough free pages (more than + * enough watermark defined here) in fast memory node, to take full + * advantage of fast memory capacity, all recently accessed slow + * memory pages will be migrated to fast memory node without + * considering hot threshold. + */ +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_wmark; + + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + wmark_pages(zone, WMARK_PROMO) + enough_wmark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +/* + * For memory tiering mode, when page tables are scanned, the scan + * time will be recorded in struct page in addition to make page + * PROT_NONE for slow memory page. So when the page is accessed, in + * hint page fault handler, the hint page fault latency is calculated + * via, + * + * hint page fault latency = hint page fault time - scan time + * + * The smaller the hint page fault latency, the higher the possibility + * for the page to be hot. + */ +static int numa_hint_fault_latency(struct page *page) +{ + int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = xchg_page_access_time(page, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. 
+ */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + +#define NUMA_MIGRATION_ADJUST_STEPS 16 + +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, + unsigned long rate_limit, + unsigned int ref_th) +{ + unsigned int now, start, th_period, unit_th, th; + unsigned long nr_cand, ref_cand, diff_cand; + + now = jiffies_to_msecs(jiffies); + th_period = sysctl_numa_balancing_scan_period_max; + start = pgdat->nbp_th_start; + if (now - start > th_period && + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { + ref_cand = rate_limit * + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; + th = pgdat->nbp_threshold ? : ref_th; + if (diff_cand > ref_cand * 11 / 10) + th = max(th - unit_th, unit_th); + else if (diff_cand < ref_cand * 9 / 10) + th = min(th + unit_th, ref_th * 2); + pgdat->nbp_th_nr_cand = nr_cand; + pgdat->nbp_threshold = th; + } +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1443,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of private/shared. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !node_is_toptier(src_nid)) { + struct pglist_data *pgdat; + unsigned long rate_limit; + unsigned int latency, th, def_th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) { + /* workload changed, reset hot threshold */ + pgdat->nbp_threshold = 0; + return true; + } + + def_th = sysctl_numa_balancing_hot_threshold; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); + + th = pgdat->nbp_threshold ? : def_th; + latency = numa_hint_fault_latency(page); + if (latency >= th) + return false; + + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) + return false; + /* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. 
The magic number 4 is based on waiting for @@ -1592,11 +1744,11 @@ numa_type numa_classify(unsigned int imbalance_pct, #ifdef CONFIG_SCHED_SMT /* Forward declarations of select_idle_sibling helpers */ -static inline bool test_idle_cores(int cpu, bool def); +static inline bool test_idle_cores(int cpu); static inline int numa_idle_core(int idle_core, int cpu) { if (!static_branch_likely(&sched_smt_present) || - idle_core >= 0 || !test_idle_cores(cpu, false)) + idle_core >= 0 || !test_idle_cores(cpu)) return idle_core; /* @@ -2600,7 +2752,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, if (!join) return; - BUG_ON(irqs_disabled()); + WARN_ON_ONCE(irqs_disabled()); double_lock_irq(&my_grp->lock, &grp->lock); for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) { @@ -2685,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* + * NUMA faults statistics are unnecessary for the slow memory + * node for memory tiering mode. + */ + if (!node_is_toptier(mem_node) && + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || + !cpupid_valid(last_cpupid))) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2765,6 +2926,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2821,13 +2983,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; @@ -3838,8 +4003,7 @@ static void migrate_se_pelt_lag(struct sched_entity *se) {} * @cfs_rq: cfs_rq to update * * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) - * avg. The immediate corollary is that all (fair) tasks must be attached, see - * post_init_entity_util_avg(). + * avg. The immediate corollary is that all (fair) tasks must be attached. * * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. * @@ -4003,6 +4167,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s #define UPDATE_TG 0x1 #define SKIP_AGE_LOAD 0x2 #define DO_ATTACH 0x4 +#define DO_DETACH 0x8 /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) @@ -4032,6 +4197,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq); + } else if (flags & DO_DETACH) { + /* + * DO_DETACH means we're here from dequeue_entity() + * and we are migrating task out of the CPU. 
+ */ + detach_entity_load_avg(cfs_rq, se); + update_tg_load_avg(cfs_rq); } else if (decayed) { cfs_rq_util_change(cfs_rq, 0); @@ -4064,8 +4236,8 @@ static void remove_entity_load_avg(struct sched_entity *se) /* * tasks cannot exit without having gone through wake_up_new_task() -> - * post_init_entity_util_avg() which will have added things to the - * cfs_rq, so we can remove unconditionally. + * enqueue_task_fair() which will have added things to the cfs_rq, + * so we can remove unconditionally. */ sync_entity_load_avg(se); @@ -4262,7 +4434,7 @@ static inline int task_fits_capacity(struct task_struct *p, static inline void update_misfit_status(struct task_struct *p, struct rq *rq) { - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return; if (!p || p->nr_cpus_allowed == 1) { @@ -4292,6 +4464,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 +#define DO_DETACH 0x0 static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1) { @@ -4434,7 +4607,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When enqueuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - For group_entity, update its weight to reflect the new share of * its group cfs_rq * - Add its new weight to cfs_rq->load.weight @@ -4511,6 +4685,11 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + int action = UPDATE_TG; + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + /* * Update run-time statistics of the 'current'. */ @@ -4519,12 +4698,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* * When dequeuing a sched_entity, we must: * - Update loads to have both entity and cfs_rq synced with now. - * - Subtract its load from the cfs_rq->runnable_avg. + * - For group_entity, update its runnable_weight to reflect the new + * h_nr_running of its group cfs_rq. * - Subtract its previous weight from cfs_rq->load.weight. * - For group entity, update its weight to reflect the new share * of its group cfs_rq. */ - update_load_avg(cfs_rq, se, UPDATE_TG); + update_load_avg(cfs_rq, se, action); se_update_runnable(se); update_stats_dequeue_fair(cfs_rq, se, flags); @@ -5893,8 +6073,8 @@ dequeue_throttle: #ifdef CONFIG_SMP /* Working cpumask for: load_balance, load_balance_newidle. 
*/ -DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); -DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); +static DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); +static DEFINE_PER_CPU(cpumask_var_t, select_rq_mask); #ifdef CONFIG_NO_HZ_COMMON @@ -6260,7 +6440,7 @@ static inline void set_idle_cores(int cpu, int val) WRITE_ONCE(sds->has_idle_cores, val); } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { struct sched_domain_shared *sds; @@ -6268,7 +6448,7 @@ static inline bool test_idle_cores(int cpu, bool def) if (sds) return READ_ONCE(sds->has_idle_cores); - return def; + return false; } /* @@ -6284,7 +6464,7 @@ void __update_idle_core(struct rq *rq) int cpu; rcu_read_lock(); - if (test_idle_cores(core, true)) + if (test_idle_cores(core)) goto unlock; for_each_cpu(cpu, cpu_smt_mask(core)) { @@ -6310,9 +6490,6 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu bool idle = true; int cpu; - if (!static_branch_likely(&sched_smt_present)) - return __select_idle_cpu(core, p); - for_each_cpu(cpu, cpu_smt_mask(core)) { if (!available_idle_cpu(cpu)) { idle = false; @@ -6339,13 +6516,12 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu /* * Scan the local SMT mask for idle CPUs. */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_smt(struct task_struct *p, int target) { int cpu; - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) + for_each_cpu_and(cpu, cpu_smt_mask(target), p->cpus_ptr) { + if (cpu == target) continue; if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) return cpu; @@ -6360,9 +6536,9 @@ static inline void set_idle_cores(int cpu, int val) { } -static inline bool test_idle_cores(int cpu, bool def) +static inline bool test_idle_cores(int cpu) { - return def; + return false; } static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) @@ -6370,7 +6546,7 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma return __select_idle_cpu(core, p); } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_smt(struct task_struct *p, int target) { return -1; } @@ -6389,19 +6565,19 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct sched_domain_shared *sd_share; struct rq *this_rq = this_rq(); int this = smp_processor_id(); - struct sched_domain *this_sd; + struct sched_domain *this_sd = NULL; u64 time = 0; - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); if (sched_feat(SIS_PROP) && !has_idle_core) { u64 avg_cost, avg_idle, span_avg; unsigned long now = jiffies; + this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); + if (!this_sd) + return -1; + /* * If we're busy, the assumption that the last idle period * predicts the future is flawed; age away the remaining @@ -6455,7 +6631,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && !has_idle_core) { + if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { time = cpu_clock(this) - time; /* @@ -6506,7 +6682,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) static inline 
bool asym_fits_capacity(unsigned long task_util, int cpu) { - if (static_branch_unlikely(&sched_asym_cpucapacity)) + if (sched_asym_cpucap_active()) return fits_capacity(task_util, capacity_of(cpu)); return true; @@ -6526,7 +6702,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * On asymmetric system, update task utilization because we will check * that the task fits with cpu's capacity. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); task_util = uclamp_task_util(p); } @@ -6580,7 +6756,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * For asymmetric CPU capacity systems, our domain of interest is * sd_asym_cpucapacity rather than sd_llc. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); /* * On an asymmetric CPU capacity system where an exclusive @@ -6601,10 +6777,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) return target; if (sched_smt_active()) { - has_idle_core = test_idle_cores(target, false); + has_idle_core = test_idle_cores(target); if (!has_idle_core && cpus_share_cache(prev, target)) { - i = select_idle_smt(p, sd, prev); + i = select_idle_smt(p, prev); if ((unsigned int)i < nr_cpumask_bits) return i; } @@ -7076,8 +7252,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) return new_cpu; } -static void detach_entity_cfs_rq(struct sched_entity *se); - /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the @@ -7099,15 +7273,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) se->vruntime -= u64_u32_load(cfs_rq->min_vruntime); } - if (p->on_rq == TASK_ON_RQ_MIGRATING) { - /* - * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' - * rq->lock and can modify state directly. - */ - lockdep_assert_rq_held(task_rq(p)); - detach_entity_cfs_rq(se); - - } else { + if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); /* @@ -7279,7 +7445,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; find_matching_se(&se, &pse); - BUG_ON(!pse); + WARN_ON_ONCE(!pse); cse_is_idle = se_is_idle(se); pse_is_idle = se_is_idle(pse); @@ -7938,7 +8104,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* Record that we found at least one task that could run on dst_cpu */ env->flags &= ~LBF_ALL_PINNED; - if (task_running(env->src_rq, p)) { + if (task_on_cpu(env->src_rq, p)) { schedstat_inc(p->stats.nr_failed_migrations_running); return 0; } @@ -8012,8 +8178,6 @@ static struct task_struct *detach_one_task(struct lb_env *env) return NULL; } -static const unsigned int sched_nr_migrate_break = 32; - /* * detach_tasks() -- tries to detach up to imbalance load/util/tasks from * busiest_rq, as part of a balancing operation within domain "sd". @@ -8049,20 +8213,24 @@ static int detach_tasks(struct lb_env *env) if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) break; - p = list_last_entry(tasks, struct task_struct, se.group_node); - env->loop++; - /* We've more or less seen every task there is, call it quits */ - if (env->loop > env->loop_max) + /* + * We've more or less seen every task there is, call it quits + * unless we haven't found any movable task yet. 
+ */ + if (env->loop > env->loop_max && + !(env->flags & LBF_ALL_PINNED)) break; /* take a breather every nr_migrate tasks */ if (env->loop > env->loop_break) { - env->loop_break += sched_nr_migrate_break; + env->loop_break += SCHED_NR_MIGRATE_BREAK; env->flags |= LBF_NEED_BREAK; break; } + p = list_last_entry(tasks, struct task_struct, se.group_node); + if (!can_migrate_task(p, env)) goto next; @@ -8159,7 +8327,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) { lockdep_assert_rq_held(rq); - BUG_ON(task_rq(p) != rq); + WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); check_preempt_curr(rq, p, 0); } @@ -10099,14 +10267,13 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct rq *busiest; struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); - struct lb_env env = { .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, .dst_grpmask = sched_group_span(sd->groups), .idle = idle, - .loop_break = sched_nr_migrate_break, + .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), @@ -10134,7 +10301,7 @@ redo: goto out_balanced; } - BUG_ON(busiest == env.dst_rq); + WARN_ON_ONCE(busiest == env.dst_rq); schedstat_add(sd->lb_imbalance[idle], env.imbalance); @@ -10182,7 +10349,9 @@ more_balance: if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; - goto more_balance; + /* Stop if we tried all running tasks */ + if (env.loop < busiest->nr_running) + goto more_balance; } /* @@ -10213,7 +10382,7 @@ more_balance: env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_DST_PINNED; env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; /* * Go back to "more_balance" rather than "redo" since we @@ -10245,7 +10414,7 @@ more_balance: */ if (!cpumask_subset(cpus, env.dst_grpmask)) { env.loop = 0; - env.loop_break = sched_nr_migrate_break; + env.loop_break = SCHED_NR_MIGRATE_BREAK; goto redo; } goto out_all_pinned; @@ -10430,7 +10599,7 @@ static int active_load_balance_cpu_stop(void *data) * we need to fix it. Originally reported by * Bjorn Helgaas on a 128-CPU setup. */ - BUG_ON(busiest_rq == target_rq); + WARN_ON_ONCE(busiest_rq == target_rq); /* Search for an sd spanning us and the target CPU. */ rcu_read_lock(); @@ -10916,8 +11085,7 @@ static bool update_nohz_stats(struct rq *rq) * can be a simple update of blocked load or a complete load balance with * tasks movement depending of flags. */ -static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags, - enum cpu_idle_type idle) +static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) { /* Earliest time when we have to do rebalance again */ unsigned long now = jiffies; @@ -11032,7 +11200,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (idle != CPU_IDLE) return false; - _nohz_idle_balance(this_rq, flags, idle); + _nohz_idle_balance(this_rq, flags); return true; } @@ -11052,7 +11220,7 @@ void nohz_run_idle_balance(int cpu) * (ie NOHZ_STATS_KICK set) and will do the same. 
*/ if ((flags == NOHZ_NEWILB_KICK) && !need_resched()) - _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE); + _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK); } static void nohz_newidle_balance(struct rq *this_rq) @@ -11552,6 +11720,17 @@ static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); +#ifdef CONFIG_SMP + /* + * In case the task sched_avg hasn't been attached: + * - A forked task which hasn't been woken up by wake_up_new_task(). + * - A task which has been woken up by try_to_wake_up() but is + * waiting for actually being woken up by sched_ttwu_pending(). + */ + if (!se->avg.last_update_time) + return; +#endif + /* Catch up with the cfs_rq and remove our load when we leave */ update_load_avg(cfs_rq, se, 0); detach_entity_load_avg(cfs_rq, se); @@ -11563,14 +11742,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * Since the real-depth could have been changed (only FAIR - * class maintain depth value), reset depth properly. - */ - se->depth = se->parent ? se->parent->depth + 1 : 0; -#endif - /* Synchronize entity with its cfs_rq */ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); attach_entity_load_avg(cfs_rq, se); @@ -11666,39 +11837,25 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED -static void task_set_group_fair(struct task_struct *p) +static void task_change_group_fair(struct task_struct *p) { - struct sched_entity *se = &p->se; - - set_task_rq(p, task_cpu(p)); - se->depth = se->parent ? se->parent->depth + 1 : 0; -} + /* + * We couldn't detach or attach a forked task which + * hasn't been woken up by wake_up_new_task(). + */ + if (READ_ONCE(p->__state) == TASK_NEW) + return; -static void task_move_group_fair(struct task_struct *p) -{ detach_task_cfs_rq(p); - set_task_rq(p, task_cpu(p)); #ifdef CONFIG_SMP /* Tell se's cfs_rq has been changed -- migrated */ p->se.avg.last_update_time = 0; #endif + set_task_rq(p, task_cpu(p)); attach_task_cfs_rq(p); } -static void task_change_group_fair(struct task_struct *p, int type) -{ - switch (type) { - case TASK_SET_GROUP: - task_set_group_fair(p); - break; - - case TASK_MOVE_GROUP: - task_move_group_fair(p); - break; - } -} - void free_fair_sched_group(struct task_group *tg) { int i; @@ -12075,6 +12232,13 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP + int i; + + for_each_possible_cpu(i) { + zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); + zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + } + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index ecb4b4ff4ce0..ee2ecc081422 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -181,6 +181,7 @@ static void group_init(struct psi_group *group) { int cpu; + group->enabled = true; for_each_possible_cpu(cpu) seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); group->avg_last_update = sched_clock(); @@ -201,6 +202,7 @@ void __init psi_init(void) { if (!psi_enable) { static_branch_enable(&psi_disabled); + static_branch_disable(&psi_cgroups_enabled); return; } @@ -211,7 +213,7 @@ void __init psi_init(void) group_init(&psi_system); } -static bool test_state(unsigned int *tasks, enum psi_states state) +static bool test_state(unsigned int *tasks, enum psi_states 
state, bool oncpu) { switch (state) { case PSI_IO_SOME: @@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return unlikely(tasks[NR_MEMSTALL] && tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: - return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] > oncpu); case PSI_CPU_FULL: - return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); + return unlikely(tasks[NR_RUNNING] && !oncpu); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu, bool wake_clock) { struct psi_group_cpu *groupc; - u32 state_mask = 0; unsigned int t, m; enum psi_states s; + u32 state_mask; groupc = per_cpu_ptr(group->pcpu, cpu); /* - * First we assess the aggregate resource states this CPU's - * tasks have been in since the last change, and account any - * SOME and FULL time these may have resulted in. - * - * Then we update the task counts according to the state + * First we update the task counts according to the state * change requested through the @clear and @set bits. + * + * Then if the cgroup PSI stats accounting enabled, we + * assess the aggregate resource states this CPU's tasks + * have been in since the last change, and account any + * SOME and FULL time these may have resulted in. */ write_seqcount_begin(&groupc->seq); - record_times(groupc, now); + /* + * Start with TSK_ONCPU, which doesn't have a corresponding + * task count - it's just a boolean flag directly encoded in + * the state mask. Clear, set, or carry the current state if + * no changes are requested. + */ + if (unlikely(clear & TSK_ONCPU)) { + state_mask = 0; + clear &= ~TSK_ONCPU; + } else if (unlikely(set & TSK_ONCPU)) { + state_mask = PSI_ONCPU; + set &= ~TSK_ONCPU; + } else { + state_mask = groupc->state_mask & PSI_ONCPU; + } + /* + * The rest of the state mask is calculated based on the task + * counts. Update those first, then construct the mask. + */ for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) continue; if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], groupc->tasks[4], - clear, set); + groupc->tasks[3], clear, set); psi_bug = 1; } } @@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu, if (set & (1 << t)) groupc->tasks[t]++; - /* Calculate state mask representing active states */ + if (!group->enabled) { + /* + * On the first group change after disabling PSI, conclude + * the current state and flush its time. This is unlikely + * to matter to the user, but aggregation (get_recent_times) + * may have already incorporated the live state into times_prev; + * avoid a delta sample underflow when PSI is later re-enabled. 
+ */ + if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE))) + record_times(groupc, now); + + groupc->state_mask = state_mask; + + write_seqcount_end(&groupc->seq); + return; + } + for (s = 0; s < NR_PSI_STATES; s++) { - if (test_state(groupc->tasks, s)) + if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU)) state_mask |= (1 << s); } @@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu, * task in a cgroup is in_memstall, the corresponding groupc * on that cpu is in PSI_MEM_FULL state. */ - if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) + if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL); + record_times(groupc, now); + groupc->state_mask = state_mask; write_seqcount_end(&groupc->seq); @@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu, schedule_delayed_work(&group->avgs_work, PSI_FREQ); } -static struct psi_group *iterate_groups(struct task_struct *task, void **iter) +static inline struct psi_group *task_psi_group(struct task_struct *task) { - if (*iter == &psi_system) - return NULL; - #ifdef CONFIG_CGROUPS - if (static_branch_likely(&psi_cgroups_enabled)) { - struct cgroup *cgroup = NULL; - - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else - cgroup = cgroup_parent(*iter); - - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); - } - } + if (static_branch_likely(&psi_cgroups_enabled)) + return cgroup_psi(task_dfl_cgroup(task)); #endif - *iter = &psi_system; return &psi_system; } @@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set) { int cpu = task_cpu(task); struct psi_group *group; - bool wake_clock = true; - void *iter = NULL; u64 now; if (!task->pid) @@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set) psi_flags_change(task, clear, set); now = cpu_clock(cpu); - /* - * Periodic aggregation shuts off if there is a period of no - * task changes, so we wake it back up if necessary. However, - * don't do this if the task change is the aggregation worker - * itself going to sleep, or we'll ping-pong forever. - */ - if (unlikely((clear & TSK_RUNNING) && - (task->flags & PF_WQ_WORKER) && - wq_worker_last_func(task) == psi_avgs_work)) - wake_clock = false; - while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, now, wake_clock); + group = task_psi_group(task); + do { + psi_group_change(group, cpu, clear, set, now, true); + } while ((group = group->parent)); } void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, { struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); - void *iter; u64 now = cpu_clock(cpu); if (next->pid) { - bool identical_state; - psi_flags_change(next, 0, TSK_ONCPU); /* - * When switching between tasks that have an identical - * runtime state, the cgroup that contains both tasks - * we reach the first common ancestor. Iterate @next's - * ancestors only until we encounter @prev's ONCPU. + * Set TSK_ONCPU on @next's cgroups. If @next shares any + * ancestors with @prev, those will already have @prev's + * TSK_ONCPU bit set, and we can stop the iteration there. 
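With the psi.c hunks above, "this group has a task on the CPU" is no longer a tasks[NR_ONCPU] counter but the PSI_ONCPU bit of groupc->state_mask. A hedged sketch of how the CPU pressure states derive from that bit, mirroring test_state(); not code from the patch:

	struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
	bool oncpu = groupc->state_mask & PSI_ONCPU;
	/* SOME: at least one runnable task is waiting for the CPU */
	bool cpu_some = groupc->tasks[NR_RUNNING] > oncpu;
	/* FULL: tasks are runnable but none of them is currently running */
	bool cpu_full = groupc->tasks[NR_RUNNING] && !oncpu;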
*/ - identical_state = prev->psi_flags == next->psi_flags; - iter = NULL; - while ((group = iterate_groups(next, &iter))) { - if (identical_state && - per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + group = task_psi_group(next); + do { + if (per_cpu_ptr(group->pcpu, cpu)->state_mask & + PSI_ONCPU) { common = group; break; } psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); - } + } while ((group = group->parent)); } if (prev->pid) { int clear = TSK_ONCPU, set = 0; + bool wake_clock = true; /* * When we're going to sleep, psi_dequeue() lets us @@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; + + /* + * Periodic aggregation shuts off if there is a period of no + * task changes, so we wake it back up if necessary. However, + * don't do this if the task change is the aggregation worker + * itself going to sleep, or we'll ping-pong forever. + */ + if (unlikely((prev->flags & PF_WQ_WORKER) && + wq_worker_last_func(prev) == psi_avgs_work)) + wake_clock = false; } psi_flags_change(prev, clear, set); - iter = NULL; - while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, now, true); + group = task_psi_group(prev); + do { + if (group == common) + break; + psi_group_change(group, cpu, clear, set, now, wake_clock); + } while ((group = group->parent)); /* - * TSK_ONCPU is handled up to the common ancestor. If we're tasked - * with dequeuing too, finish that for the rest of the hierarchy. + * TSK_ONCPU is handled up to the common ancestor. If there are + * any other differences between the two tasks (e.g. prev goes + * to sleep, or only one task is memstall), finish propagating + * those differences all the way up to the root. 
*/ - if (sleep) { + if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) { clear &= ~TSK_ONCPU; - for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, now, true); + for (; group; group = group->parent) + psi_group_change(group, cpu, clear, set, now, wake_clock); } } } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) +{ + int cpu = task_cpu(task); + struct psi_group *group; + struct psi_group_cpu *groupc; + u64 now; + + if (!task->pid) + return; + + now = cpu_clock(cpu); + + group = task_psi_group(task); + do { + if (!group->enabled) + continue; + + groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + write_seqcount_end(&groupc->seq); + + if (group->poll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1); + } while ((group = group->parent)); +} +#endif + /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections @@ -917,6 +974,7 @@ void psi_memstall_enter(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_enter); /** * psi_memstall_leave - mark the end of an memory stall section @@ -946,11 +1004,12 @@ void psi_memstall_leave(unsigned long *flags) rq_unlock_irq(rq, &rf); } +EXPORT_SYMBOL_GPL(psi_memstall_leave); #ifdef CONFIG_CGROUPS int psi_cgroup_alloc(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return 0; cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL); @@ -963,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup) return -ENOMEM; } group_init(cgroup->psi); + cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup)); return 0; } void psi_cgroup_free(struct cgroup *cgroup) { - if (static_branch_likely(&psi_disabled)) + if (!static_branch_likely(&psi_cgroups_enabled)) return; cancel_delayed_work_sync(&cgroup->psi->avgs_work); @@ -996,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) struct rq_flags rf; struct rq *rq; - if (static_branch_likely(&psi_disabled)) { + if (!static_branch_likely(&psi_cgroups_enabled)) { /* * Lame to do this here, but the scheduler cannot be locked * from the outside, so we move cgroups from inside sched/. @@ -1044,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) task_rq_unlock(rq, task, &rf); } + +void psi_cgroup_restart(struct psi_group *group) +{ + int cpu; + + /* + * After we disable psi_group->enabled, we don't actually + * stop percpu tasks accounting in each psi_group_cpu, + * instead only stop test_state() loop, record_times() + * and averaging worker, see psi_group_change() for details. + * + * When disable cgroup PSI, this function has nothing to sync + * since cgroup pressure files are hidden and percpu psi_group_cpu + * would see !psi_group->enabled and only do task accounting. + * + * When re-enable cgroup PSI, this function use psi_group_change() + * to get correct state mask from test_state() loop on tasks[], + * and restart groupc->state_start from now, use .clear = .set = 0 + * here since no task status really changed. 
+ */ + if (!group->enabled) + return; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + u64 now; + + rq_lock_irq(rq, &rf); + now = cpu_clock(cpu); + psi_group_change(group, cpu, 0, 0, now, true); + rq_unlock_irq(rq, &rf); + } +} #endif /* CONFIG_CGROUPS */ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; @@ -1062,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2; full++) { +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { unsigned long avg[3] = { 0, }; u64 total = 0; int w; @@ -1076,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1104,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); @@ -1388,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_irq_show); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1395,6 +1526,9 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 55f39c8f4203..d869bcf898cc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -509,7 +509,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) unsigned int cpu_cap; /* Only heterogeneous systems can benefit from this check */ - if (!static_branch_unlikely(&sched_asym_cpucapacity)) + if (!sched_asym_cpucap_active()) return true; min_cap = uclamp_eff_value(p, UCLAMP_MIN); @@ -843,7 +843,7 @@ static void __disable_runtime(struct rq *rq) * We cannot be left wanting - that would mean some runtime * leaked out of the system. 
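The psi.c additions above also expose a new /proc/pressure/irq file when CONFIG_IRQ_TIME_ACCOUNTING is set; unlike the other pressure files it reports only a "full" line. A hypothetical userspace reader, just to show the expected format (not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/pressure/irq", "r");

		if (!f)
			return 1;	/* no PSI, or kernel built without IRQ time accounting */
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* e.g. "full avg10=0.00 avg60=0.00 avg300=0.00 total=0" */
		fclose(f);
		return 0;
	}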
*/ - BUG_ON(want); + WARN_ON_ONCE(want); balanced: /* * Disable all the borrow logic by pretending we have inf @@ -1062,11 +1062,7 @@ static void update_curr_rt(struct rq *rq) trace_sched_stat_runtime(curr, delta_exec, 0); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = now; - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); if (!rt_bandwidth_enabled()) return; @@ -1849,7 +1845,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { - if (!task_running(rq, p) && + if (!task_on_cpu(rq, p) && cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; @@ -1897,7 +1893,7 @@ static int find_lowest_rq(struct task_struct *task) * If we're on asym system ensure we consider the different capacities * of the CPUs when searching for the lowest_mask. */ - if (static_branch_unlikely(&sched_asym_cpucapacity)) { + if (sched_asym_cpucap_active()) { ret = cpupri_find_fitness(&task_rq(task)->rd->cpupri, task, lowest_mask, @@ -2004,7 +2000,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) */ if (unlikely(task_rq(task) != rq || !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || - task_running(rq, task) || + task_on_cpu(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { @@ -2462,7 +2458,7 @@ skip: */ static void task_woken_rt(struct rq *rq, struct task_struct *p) { - bool need_to_push = !task_running(rq, p) && + bool need_to_push = !task_on_cpu(rq, p) && !test_tsk_need_resched(rq->curr) && p->nr_cpus_allowed > 1 && (dl_task(rq->curr) || rt_task(rq->curr)) && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e26688d387ae..1644242ecd11 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,21 +321,6 @@ struct dl_bw { u64 total_bw; }; -/* - * Verify the fitness of task @p to run on @cpu taking into account the - * CPU original capacity and the runtime/deadline ratio of the task. - * - * The function will return true if the CPU original capacity of the - * @cpu scaled by SCHED_CAPACITY_SCALE >= runtime/deadline ratio of the - * task and false otherwise. - */ -static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) -{ - unsigned long cap = arch_scale_cpu_capacity(cpu); - - return cap_scale(p->dl.dl_deadline, cap) >= p->dl.dl_runtime; -} - extern void init_dl_bw(struct dl_bw *dl_b); extern int sched_dl_global_validate(void); extern void sched_dl_do_global(void); @@ -1815,6 +1800,11 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +static __always_inline bool sched_asym_cpucap_active(void) +{ + return static_branch_unlikely(&sched_asym_cpucapacity); +} + struct sched_group_capacity { atomic_t ref; /* @@ -1942,6 +1932,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; + p->se.depth = tg->se[cpu] ? 
tg->se[cpu]->depth + 1 : 0; #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -2060,7 +2051,7 @@ static inline int task_current(struct rq *rq, struct task_struct *p) return rq->curr == p; } -static inline int task_running(struct rq *rq, struct task_struct *p) +static inline int task_on_cpu(struct rq *rq, struct task_struct *p) { #ifdef CONFIG_SMP return p->on_cpu; @@ -2204,11 +2195,8 @@ struct sched_class { void (*update_curr)(struct rq *rq); -#define TASK_SET_GROUP 0 -#define TASK_MOVE_GROUP 1 - #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_change_group)(struct task_struct *p, int type); + void (*task_change_group)(struct task_struct *p); #endif }; @@ -2435,6 +2423,12 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +#ifdef CONFIG_PREEMPT_RT +#define SCHED_NR_MIGRATE_BREAK 8 +#else +#define SCHED_NR_MIGRATE_BREAK 32 +#endif + extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; @@ -2452,6 +2446,7 @@ extern unsigned int sysctl_numa_balancing_scan_delay; extern unsigned int sysctl_numa_balancing_scan_period_min; extern unsigned int sysctl_numa_balancing_scan_period_max; extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_hot_threshold; #endif #ifdef CONFIG_SCHED_HRTICK @@ -2709,8 +2704,8 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) __acquires(rq1->lock) __acquires(rq2->lock) { - BUG_ON(!irqs_disabled()); - BUG_ON(rq1 != rq2); + WARN_ON_ONCE(!irqs_disabled()); + WARN_ON_ONCE(rq1 != rq2); raw_spin_rq_lock(rq1); __acquire(rq2->lock); /* Fake it out ;) */ double_rq_clock_clear_update(rq1, rq2); @@ -2726,7 +2721,7 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) __releases(rq1->lock) __releases(rq2->lock) { - BUG_ON(rq1 != rq2); + WARN_ON_ONCE(rq1 != rq2); raw_spin_rq_unlock(rq1); __release(rq2->lock); } @@ -2896,6 +2891,21 @@ unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, enum cpu_util_type type, struct task_struct *p); +/* + * Verify the fitness of task @p to run on @cpu taking into account the + * CPU original capacity and the runtime/deadline ratio of the task. + * + * The function will return true if the original capacity of @cpu is + * greater than or equal to task's deadline density right shifted by + * (BW_SHIFT - SCHED_CAPACITY_SHIFT) and false otherwise. 
+ */ +static inline bool dl_task_fits_capacity(struct task_struct *p, int cpu) +{ + unsigned long cap = arch_scale_cpu_capacity(cpu); + + return cap >= p->dl.dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT); +} + static inline unsigned long cpu_bw_dl(struct rq *rq) { return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; @@ -3157,4 +3167,14 @@ extern int sched_dynamic_mode(const char *str); extern void sched_dynamic_update(int mode); #endif +static inline void update_current_exec_runtime(struct task_struct *curr, + u64 now, u64 delta_exec) +{ + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = now; + cgroup_account_cputime(curr, delta_exec); +} + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baa839c1ba96..84a188913cc9 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se) } #ifdef CONFIG_PSI +void psi_task_change(struct task_struct *task, int clear, int set); +void psi_task_switch(struct task_struct *prev, struct task_struct *next, + bool sleep); +void psi_account_irqtime(struct task_struct *task, u32 delta); + /* * PSI tracks state that persists across sleeps, such as iowaits and * memory stalls. As a result, it has to distinguish between sleeps, @@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} +static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index d04073a93eb4..85590599b4d6 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -71,20 +71,17 @@ static void yield_task_stop(struct rq *rq) static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) { struct task_struct *curr = rq->curr; - u64 delta_exec; + u64 now, delta_exec; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; schedstat_set(curr->stats.exec_max, max(curr->stats.exec_max, delta_exec)); - curr->se.sum_exec_runtime += delta_exec; - account_group_exec_runtime(curr, delta_exec); - - curr->se.exec_start = rq_clock_task(rq); - cgroup_account_cputime(curr, delta_exec); + update_current_exec_runtime(curr, now, delta_exec); } /* diff --git a/kernel/signal.c b/kernel/signal.c index 6f86fda5e432..d140672185a4 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -913,8 +913,9 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) if (signal->core_state) return sig == SIGKILL; /* - * The process is in the middle of dying, nothing to do. + * The process is in the middle of dying, drop the signal. */ + return false; } else if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. 
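The rewritten dl_task_fits_capacity() above compares the CPU's original capacity against the task's pre-computed deadline density rather than rescaling the deadline on every call. A worked example with assumed values (taking the usual BW_SHIFT = 20 and SCHED_CAPACITY_SHIFT = 10, runtime and deadline in the same time unit):

	/* Assumed task: runtime = 2, deadline = 10, i.e. a 20% bandwidth request.
	 *
	 *   dl_density        = (runtime << BW_SHIFT) / deadline
	 *                     = (2 << 20) / 10                          ~= 209715
	 *   required capacity = dl_density >> (BW_SHIFT - SCHED_CAPACITY_SHIFT)
	 *                     = 209715 >> 10                            ~= 204
	 *
	 * 204 out of SCHED_CAPACITY_SCALE (1024) is ~20%, the same runtime/deadline
	 * ratio the removed cap_scale() formulation expressed.
	 */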
@@ -2304,7 +2305,7 @@ static int ptrace_stop(int exit_code, int why, unsigned long message, read_unlock(&tasklist_lock); cgroup_enter_frozen(); preempt_enable_no_resched(); - freezable_schedule(); + schedule(); cgroup_leave_frozen(true); /* @@ -2473,7 +2474,7 @@ static bool do_signal_stop(int signr) /* Now we don't run again until woken by SIGCONT or SIGKILL */ cgroup_enter_frozen(); - freezable_schedule(); + schedule(); return true; } else { /* @@ -2548,11 +2549,11 @@ static void do_freezer_trap(void) * immediately (if there is a non-fatal signal pending), and * put the task into sleep. */ - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); clear_thread_flag(TIF_SIGPENDING); spin_unlock_irq(¤t->sighand->siglock); cgroup_enter_frozen(); - freezable_schedule(); + schedule(); } static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type) @@ -3600,9 +3601,9 @@ static int do_sigtimedwait(const sigset_t *which, kernel_siginfo_t *info, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); - __set_current_state(TASK_INTERRUPTIBLE); - ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, - HRTIMER_MODE_REL); + __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + ret = schedule_hrtimeout_range(to, tsk->timer_slack_ns, + HRTIMER_MODE_REL); spin_lock_irq(&tsk->sighand->siglock); __set_task_blocked(tsk, &tsk->real_blocked); sigemptyset(&tsk->real_blocked); diff --git a/kernel/smp.c b/kernel/smp.c index e8cdc025a046..06a413987a14 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -1069,7 +1069,7 @@ static int __init nrcpus(char *str) int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) - nr_cpu_ids = nr_cpus; + set_nr_cpu_ids(nr_cpus); return 0; } @@ -1087,14 +1087,16 @@ static int __init maxcpus(char *str) early_param("maxcpus", maxcpus); +#if (NR_CPUS > 1) && !defined(CONFIG_FORCE_NR_CPUS) /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); +#endif /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { - nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; + set_nr_cpu_ids(find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1); } /* Called by boot processor to activate the rest. */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9f54544e749..2c7396da470c 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -433,7 +433,7 @@ bool cpu_wait_death(unsigned int cpu, int seconds) /* The outgoing CPU will normally get done quite quickly. */ if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) - goto update_state; + goto update_state_early; udelay(5); /* But if the outgoing CPU dawdles, wait increasingly long times. */ @@ -444,16 +444,17 @@ bool cpu_wait_death(unsigned int cpu, int seconds) break; sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); } -update_state: +update_state_early: oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +update_state: if (oldstate == CPU_DEAD) { /* Outgoing CPU died normally, update state. */ smp_mb(); /* atomic_read() before update. */ atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); } else { /* Outgoing CPU still hasn't died, set state accordingly. 
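The signal.c hunks above are part of the freezer rework: instead of calling freezable_schedule(), a sleeper now tags its sleep state with TASK_FREEZABLE and uses a plain schedule(), which lets the freezer treat it as frozen without waking it. A minimal sketch of the idiom; the wait condition is a placeholder:

	set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
	if (!should_wake_up())			/* hypothetical wait condition */
		schedule();			/* may be counted as frozen here without a wakeup */
	__set_current_state(TASK_RUNNING);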
*/ - if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, CPU_BROKEN) != oldstate) + if (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, CPU_BROKEN)) goto update_state; ret = false; } @@ -475,14 +476,14 @@ bool cpu_report_death(void) int newstate; int cpu = smp_processor_id(); + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); do { - oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); if (oldstate != CPU_BROKEN) newstate = CPU_DEAD; else newstate = CPU_DEAD_FROZEN; - } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), - oldstate, newstate) != oldstate); + } while (!atomic_try_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + &oldstate, newstate)); return newstate == CPU_DEAD; } diff --git a/kernel/sys.c b/kernel/sys.c index b911fa6d81ab..5fd54bf0e886 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -25,6 +25,7 @@ #include <linux/times.h> #include <linux/posix-timers.h> #include <linux/security.h> +#include <linux/random.h> #include <linux/suspend.h> #include <linux/tty.h> #include <linux/signal.h> @@ -496,7 +497,7 @@ static void flag_nproc_exceeded(struct cred *new) * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ - if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && + if (is_rlimit_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new->user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else @@ -1366,6 +1367,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; + add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->nodename, tmp, len); @@ -1419,6 +1421,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) if (!copy_from_user(tmp, name, len)) { struct new_utsname *u; + add_device_randomness(tmp, len); down_write(&uts_sem); u = utsname(); memcpy(u->domainname, tmp, len); diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c index 664ded05dd7a..6ef887c19c48 100644 --- a/kernel/sysctl-test.c +++ b/kernel/sysctl-test.c @@ -9,9 +9,6 @@ #define KUNIT_PROC_READ 0 #define KUNIT_PROC_WRITE 1 -static int i_zero; -static int i_one_hundred = 100; - /* * Test that proc_dointvec will not try to use a NULL .data field even when the * length is non-zero. @@ -29,8 +26,8 @@ static void sysctl_test_api_dointvec_null_tbl_data(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; /* * proc_dointvec expects a buffer in user space, so we allocate one. 
We @@ -79,8 +76,8 @@ static void sysctl_test_api_dointvec_table_maxlen_unset(struct kunit *test) .maxlen = 0, .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), GFP_USER); @@ -122,8 +119,8 @@ static void sysctl_test_api_dointvec_table_len_is_zero(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), GFP_USER); @@ -156,8 +153,8 @@ static void sysctl_test_api_dointvec_table_read_but_position_set( .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; void __user *buffer = (void __user *)kunit_kzalloc(test, sizeof(int), GFP_USER); @@ -191,8 +188,8 @@ static void sysctl_test_dointvec_read_happy_single_positive(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t len = 4; loff_t pos = 0; @@ -222,8 +219,8 @@ static void sysctl_test_dointvec_read_happy_single_negative(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t len = 5; loff_t pos = 0; @@ -251,8 +248,8 @@ static void sysctl_test_dointvec_write_happy_single_positive(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; char input[] = "9"; size_t len = sizeof(input) - 1; @@ -281,8 +278,8 @@ static void sysctl_test_dointvec_write_happy_single_negative(struct kunit *test) .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; char input[] = "-9"; size_t len = sizeof(input) - 1; @@ -313,8 +310,8 @@ static void sysctl_test_api_dointvec_write_single_less_int_min( .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t max_len = 32, len = max_len; loff_t pos = 0; @@ -351,8 +348,8 @@ static void sysctl_test_api_dointvec_write_single_greater_int_max( .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, - .extra1 = &i_zero, - .extra2 = &i_one_hundred, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE_HUNDRED, }; size_t max_len = 32, len = max_len; loff_t pos = 0; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 205d605cacc5..188c305aeb8b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -82,9 +82,16 @@ #include <linux/rtmutex.h> #endif +/* shared constants to be used in various sysctls */ +const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; +EXPORT_SYMBOL(sysctl_vals); + +const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; +EXPORT_SYMBOL_GPL(sysctl_long_vals); + #if defined(CONFIG_SYSCTL) -/* Constants used for minimum and maximum */ +/* Constants used for minimum and maximum */ #ifdef CONFIG_PERF_EVENTS 
static const int six_hundred_forty_kb = 640 * 1024; @@ -129,11 +136,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; int sysctl_legacy_va_layout; #endif -#ifdef CONFIG_COMPACTION -/* min_extfrag_threshold is SYSCTL_ZERO */; -static const int max_extfrag_threshold = 1000; -#endif - #endif /* CONFIG_SYSCTL */ /* @@ -1052,9 +1054,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, return 0; } - i = (unsigned long *) data; - min = (unsigned long *) table->extra1; - max = (unsigned long *) table->extra2; + i = data; + min = table->extra1; + max = table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; @@ -1641,6 +1643,14 @@ static struct ctl_table kern_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, }, + { + .procname = "numa_balancing_promote_rate_limit_MBps", + .data = &sysctl_numa_balancing_promote_rate_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, #endif /* CONFIG_NUMA_BALANCING */ { .procname = "panic", @@ -2216,7 +2226,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&max_extfrag_threshold, + .extra2 = SYSCTL_ONE_THOUSAND, }, { .procname = "compact_unevictable_allowed", diff --git a/kernel/task_work.c b/kernel/task_work.c index dff75bcde151..065e1ef8fc8d 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -47,12 +47,12 @@ int task_work_add(struct task_struct *task, struct callback_head *work, /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack(work); + head = READ_ONCE(task->task_works); do { - head = READ_ONCE(task->task_works); if (unlikely(head == &work_exited)) return -ESRCH; work->next = head; - } while (cmpxchg(&task->task_works, head, work) != head); + } while (!try_cmpxchg(&task->task_works, &head, work)); switch (notify) { case TWA_NONE: @@ -100,10 +100,12 @@ task_work_cancel_match(struct task_struct *task, * we raced with task_work_run(), *pprev == NULL/exited. */ raw_spin_lock_irqsave(&task->pi_lock, flags); - while ((work = READ_ONCE(*pprev))) { - if (!match(work, data)) + work = READ_ONCE(*pprev); + while (work) { + if (!match(work, data)) { pprev = &work->next; - else if (cmpxchg(pprev, work, work->next) == work) + work = READ_ONCE(*pprev); + } else if (try_cmpxchg(pprev, &work, work->next)) break; } raw_spin_unlock_irqrestore(&task->pi_lock, flags); @@ -151,16 +153,16 @@ void task_work_run(void) * work->func() can do task_work_add(), do not set * work_exited unless the list is empty. */ + work = READ_ONCE(task->task_works); do { head = NULL; - work = READ_ONCE(task->task_works); if (!work) { if (task->flags & PF_EXITING) head = &work_exited; else break; } - } while (cmpxchg(&task->task_works, work, head) != work); + } while (!try_cmpxchg(&task->task_works, &work, head)); if (!work) break; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index cee5da1e54c4..8058bec87ace 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -310,7 +310,7 @@ static void clocksource_verify_choose_cpus(void) * CPUs that are currently online. 
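The task_work.c and smpboot.c conversions above follow one pattern: load the old value once before the loop and let try_cmpxchg()/atomic_try_cmpxchg() refresh it on failure, instead of re-reading and comparing the return value of cmpxchg() each iteration. A generic sketch of the idiom; set_flag_once() is made up for illustration:

	#include <linux/atomic.h>

	static void set_flag_once(atomic_t *v, int flag)
	{
		int old = atomic_read(v);	/* single explicit load before the loop */

		do {
			if (old & flag)
				return;		/* someone else already set it */
			/* on failure, atomic_try_cmpxchg() updates 'old' with the observed value */
		} while (!atomic_try_cmpxchg(v, &old, old | flag));
	}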
*/ for (i = 1; i < n; i++) { - cpu = prandom_u32() % nr_cpu_ids; + cpu = prandom_u32_max(nr_cpu_ids); cpu = cpumask_next(cpu - 1, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(cpu_online_mask); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 23af5eca11b1..3ae661ab6260 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2037,11 +2037,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod struct restart_block *restart; do { - set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) - freezable_schedule(); + schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 688552df95ca..49fb9ec8366d 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1706,6 +1706,9 @@ BPF_CALL_4(bpf_read_branch_records, struct bpf_perf_event_data_kern *, ctx, if (unlikely(flags & ~BPF_F_GET_BRANCH_RECORDS_SIZE)) return -EINVAL; + if (unlikely(!(ctx->data->sample_flags & PERF_SAMPLE_BRANCH_STACK))) + return -ENOENT; + if (unlikely(!br_stack)) return -ENOENT; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 447d2e2a8549..fbf2543111c0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1644,6 +1644,18 @@ ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_ex static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); +static bool skip_record(struct dyn_ftrace *rec) +{ + /* + * At boot up, weak functions are set to disable. Function tracing + * can be enabled before they are, and they still need to be disabled now. + * If the record is disabled, still continue if it is marked as already + * enabled (this is needed to keep the accounting working). + */ + return rec->flags & FTRACE_FL_DISABLED && + !(rec->flags & FTRACE_FL_ENABLED); +} + static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) @@ -1693,7 +1705,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; if (all) { @@ -2016,7 +2028,6 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, static void print_ip_ins(const char *fmt, const unsigned char *p) { char ins[MCOUNT_INSN_SIZE]; - int i; if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) { printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); @@ -2024,9 +2035,7 @@ static void print_ip_ins(const char *fmt, const unsigned char *p) } printk(KERN_CONT "%s", fmt); - - for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? 
":" : "", ins[i]); + pr_cont("%*phC", MCOUNT_INSN_SIZE, ins); } enum ftrace_bug_type ftrace_bug_type; @@ -2126,7 +2135,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) ftrace_bug_type = FTRACE_BUG_UNKNOWN; - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) return FTRACE_UPDATE_IGNORE; /* @@ -2241,7 +2250,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) if (update) { /* If there's no more users, clear all flags */ if (!ftrace_rec_count(rec)) - rec->flags = 0; + rec->flags &= FTRACE_FL_DISABLED; else /* * Just disable the record, but keep the ops TRAMP @@ -2634,7 +2643,7 @@ void __weak ftrace_replace_code(int mod_flags) do_for_each_ftrace_rec(pg, rec) { - if (rec->flags & FTRACE_FL_DISABLED) + if (skip_record(rec)) continue; failed = __ftrace_replace_code(rec, enable); @@ -5427,6 +5436,8 @@ static struct ftrace_ops stub_ops = { * it is safe to modify the ftrace record, where it should be * currently calling @old_addr directly, to call @new_addr. * + * This is called with direct_mutex locked. + * * Safety checks should be made to make sure that the code at * @rec->ip is currently calling @old_addr. And this must * also update entry->direct to @new_addr. @@ -5439,6 +5450,8 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, unsigned long ip = rec->ip; int ret; + lockdep_assert_held(&direct_mutex); + /* * The ftrace_lock was used to determine if the record * had more than one registered user to it. If it did, @@ -5461,7 +5474,7 @@ int __weak ftrace_modify_direct_caller(struct ftrace_func_entry *entry, if (ret) goto out_lock; - ret = register_ftrace_function(&stub_ops); + ret = register_ftrace_function_nolock(&stub_ops); if (ret) { ftrace_set_filter_ip(&stub_ops, ip, 1, 0); goto out_lock; @@ -6081,8 +6094,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) if (filter_hash) { orig_hash = &iter->ops->func_hash->filter_hash; - if (iter->tr && !list_empty(&iter->tr->mod_trace)) - iter->hash->flags |= FTRACE_HASH_FL_MOD; + if (iter->tr) { + if (list_empty(&iter->tr->mod_trace)) + iter->hash->flags &= ~FTRACE_HASH_FL_MOD; + else + iter->hash->flags |= FTRACE_HASH_FL_MOD; + } } else orig_hash = &iter->ops->func_hash->notrace_hash; diff --git a/kernel/trace/kprobe_event_gen_test.c b/kernel/trace/kprobe_event_gen_test.c index 18b0f1cbb947..80e04a1e1977 100644 --- a/kernel/trace/kprobe_event_gen_test.c +++ b/kernel/trace/kprobe_event_gen_test.c @@ -35,6 +35,45 @@ static struct trace_event_file *gen_kprobe_test; static struct trace_event_file *gen_kretprobe_test; +#define KPROBE_GEN_TEST_FUNC "do_sys_open" + +/* X86 */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_32) +#define KPROBE_GEN_TEST_ARG0 "dfd=%ax" +#define KPROBE_GEN_TEST_ARG1 "filename=%dx" +#define KPROBE_GEN_TEST_ARG2 "flags=%cx" +#define KPROBE_GEN_TEST_ARG3 "mode=+4($stack)" + +/* ARM64 */ +#elif defined(CONFIG_ARM64) +#define KPROBE_GEN_TEST_ARG0 "dfd=%x0" +#define KPROBE_GEN_TEST_ARG1 "filename=%x1" +#define KPROBE_GEN_TEST_ARG2 "flags=%x2" +#define KPROBE_GEN_TEST_ARG3 "mode=%x3" + +/* ARM */ +#elif defined(CONFIG_ARM) +#define KPROBE_GEN_TEST_ARG0 "dfd=%r0" +#define KPROBE_GEN_TEST_ARG1 "filename=%r1" +#define KPROBE_GEN_TEST_ARG2 "flags=%r2" +#define KPROBE_GEN_TEST_ARG3 "mode=%r3" + +/* RISCV */ +#elif defined(CONFIG_RISCV) +#define KPROBE_GEN_TEST_ARG0 "dfd=%a0" +#define KPROBE_GEN_TEST_ARG1 "filename=%a1" +#define KPROBE_GEN_TEST_ARG2 "flags=%a2" +#define KPROBE_GEN_TEST_ARG3 "mode=%a3" + +/* others */ 
+#else +#define KPROBE_GEN_TEST_ARG0 NULL +#define KPROBE_GEN_TEST_ARG1 NULL +#define KPROBE_GEN_TEST_ARG2 NULL +#define KPROBE_GEN_TEST_ARG3 NULL +#endif + + /* * Test to make sure we can create a kprobe event, then add more * fields. @@ -58,14 +97,14 @@ static int __init test_gen_kprobe_cmd(void) * fields. */ ret = kprobe_event_gen_cmd_start(&cmd, "gen_kprobe_test", - "do_sys_open", - "dfd=%ax", "filename=%dx"); + KPROBE_GEN_TEST_FUNC, + KPROBE_GEN_TEST_ARG0, KPROBE_GEN_TEST_ARG1); if (ret) goto free; /* Use kprobe_event_add_fields to add the rest of the fields */ - ret = kprobe_event_add_fields(&cmd, "flags=%cx", "mode=+4($stack)"); + ret = kprobe_event_add_fields(&cmd, KPROBE_GEN_TEST_ARG2, KPROBE_GEN_TEST_ARG3); if (ret) goto free; @@ -128,7 +167,7 @@ static int __init test_gen_kretprobe_cmd(void) * Define the kretprobe event. */ ret = kretprobe_event_gen_cmd_start(&cmd, "gen_kretprobe_test", - "do_sys_open", + KPROBE_GEN_TEST_FUNC, "$retval"); if (ret) goto free; @@ -206,7 +245,7 @@ static void __exit kprobe_event_gen_test_exit(void) WARN_ON(kprobe_event_delete("gen_kprobe_test")); /* Disable the event or you can't remove it */ - WARN_ON(trace_array_set_clr_event(gen_kprobe_test->tr, + WARN_ON(trace_array_set_clr_event(gen_kretprobe_test->tr, "kprobes", "gen_kretprobe_test", false)); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index d59b6a328b7f..199759c73519 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -413,6 +413,7 @@ struct rb_irq_work { struct irq_work work; wait_queue_head_t waiters; wait_queue_head_t full_waiters; + long wait_index; bool waiters_pending; bool full_waiters_pending; bool wakeup_full; @@ -884,7 +885,7 @@ size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu) } /** - * ring_buffer_nr_pages_dirty - get the number of used pages in the ring buffer + * ring_buffer_nr_dirty_pages - get the number of used pages in the ring buffer * @buffer: The ring_buffer to get the number of pages from * @cpu: The cpu of the ring_buffer to get the number of pages from * @@ -917,13 +918,45 @@ static void rb_wake_up_waiters(struct irq_work *work) struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); wake_up_all(&rbwork->waiters); - if (rbwork->wakeup_full) { + if (rbwork->full_waiters_pending || rbwork->wakeup_full) { rbwork->wakeup_full = false; + rbwork->full_waiters_pending = false; wake_up_all(&rbwork->full_waiters); } } /** + * ring_buffer_wake_waiters - wake up any waiters on this ring buffer + * @buffer: The ring buffer to wake waiters on + * + * In the case of a file that represents a ring buffer is closing, + * it is prudent to wake up any waiters that are on this. + */ +void ring_buffer_wake_waiters(struct trace_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *rbwork; + + if (cpu == RING_BUFFER_ALL_CPUS) { + + /* Wake up individual ones too. 
One level recursion */ + for_each_buffer_cpu(buffer, cpu) + ring_buffer_wake_waiters(buffer, cpu); + + rbwork = &buffer->irq_work; + } else { + cpu_buffer = buffer->buffers[cpu]; + rbwork = &cpu_buffer->irq_work; + } + + rbwork->wait_index++; + /* make sure the waiters see the new index */ + smp_wmb(); + + rb_wake_up_waiters(&rbwork->work); +} + +/** * ring_buffer_wait - wait for input to the ring buffer * @buffer: buffer to wait on * @cpu: the cpu buffer to wait on @@ -938,6 +971,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) struct ring_buffer_per_cpu *cpu_buffer; DEFINE_WAIT(wait); struct rb_irq_work *work; + long wait_index; int ret = 0; /* @@ -956,6 +990,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) work = &cpu_buffer->irq_work; } + wait_index = READ_ONCE(work->wait_index); while (true) { if (full) @@ -1011,7 +1046,7 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) nr_pages = cpu_buffer->nr_pages; dirty = ring_buffer_nr_dirty_pages(buffer, cpu); if (!cpu_buffer->shortest_full || - cpu_buffer->shortest_full < full) + cpu_buffer->shortest_full > full) cpu_buffer->shortest_full = full; raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); if (!pagebusy && @@ -1020,6 +1055,11 @@ int ring_buffer_wait(struct trace_buffer *buffer, int cpu, int full) } schedule(); + + /* Make sure to see the new wait index */ + smp_rmb(); + if (wait_index != work->wait_index) + break; } if (full) @@ -2608,6 +2648,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Mark the rest of the page with padding */ rb_event_set_padding(event); + /* Make sure the padding is visible before the write update */ + smp_wmb(); + /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; @@ -2619,6 +2662,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1; + /* Make sure the padding is visible before the tail_page->write update */ + smp_wmb(); + /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; local_sub(length, &tail_page->write); @@ -4587,6 +4633,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + /* + * The writer has preempt disable, wait for it. But not forever + * Although, 1 second is pretty much "forever" + */ +#define USECS_WAIT 1000000 + for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { + /* If the write is past the end of page, a writer is still updating it */ + if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + break; + + udelay(1); + + /* Get the latest version of the reader write value */ + smp_rmb(); + } + + /* The writer is not moving forward? Something is wrong */ + if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) + reader = NULL; + + /* + * Make sure we see any padding after the write update + * (see rb_reset_tail()) + */ + smp_rmb(); + + return reader; } @@ -5232,7 +5305,7 @@ void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu) EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); /** - * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer + * ring_buffer_reset_online_cpus - reset a ring buffer per CPU buffer * @buffer: The ring buffer to reset a per cpu buffer of * @cpu: The CPU buffer to be reset */ @@ -5302,7 +5375,7 @@ void ring_buffer_reset(struct trace_buffer *buffer) EXPORT_SYMBOL_GPL(ring_buffer_reset); /** - * rind_buffer_empty - is the ring buffer empty? 
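
/*
 * [Editorial sketch, not part of the patch] The ring-buffer hunks above add a
 * "wait_index" generation counter so that closing a file (or poking it via an
 * ioctl) can kick every task sleeping in ring_buffer_wait(): the waker bumps
 * the counter before waking everyone, and a waiter samples it before sleeping
 * and bails out if it changed. A minimal user-space analogue of the same
 * pattern, with hypothetical names and pthread/C11 primitives standing in for
 * the kernel's waitqueues and memory barriers:
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct waiter_ctl {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	atomic_long wait_index;	/* bumped by wakers to invalidate sleepers */
	bool data_ready;
};

/* Waker side: invalidate all current waiters, then wake them all. */
static void wake_all_waiters(struct waiter_ctl *ctl)
{
	atomic_fetch_add(&ctl->wait_index, 1);	/* plays the role of wait_index++ + smp_wmb() */
	pthread_mutex_lock(&ctl->lock);
	pthread_cond_broadcast(&ctl->cond);	/* plays the role of wake_up_all() */
	pthread_mutex_unlock(&ctl->lock);
}

/* Waiter side: true if data arrived, false if a waker kicked us out. */
static bool wait_for_data(struct waiter_ctl *ctl)
{
	long idx = atomic_load(&ctl->wait_index);	/* sample before sleeping */
	bool ready;

	pthread_mutex_lock(&ctl->lock);
	while (!ctl->data_ready && atomic_load(&ctl->wait_index) == idx)
		pthread_cond_wait(&ctl->cond, &ctl->lock);
	ready = ctl->data_ready;
	pthread_mutex_unlock(&ctl->lock);

	return ready;
}
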
+ * ring_buffer_empty - is the ring buffer empty? * @buffer: The ring buffer to test */ bool ring_buffer_empty(struct trace_buffer *buffer) @@ -5616,7 +5689,15 @@ int ring_buffer_read_page(struct trace_buffer *buffer, unsigned int pos = 0; unsigned int size; - if (full) + /* + * If a full page is expected, this can still be returned + * if there's been a previous partial read and the + * rest of the page can be read and the commit page is off + * the reader page. + */ + if (full && + (!read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page)) goto out_unlock; if (len > (commit - read)) diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c index 83cace53b9fa..b2b49a27e886 100644 --- a/kernel/trace/rv/monitors/wip/wip.c +++ b/kernel/trace/rv/monitors/wip/wip.c @@ -16,7 +16,7 @@ #include "wip.h" -struct rv_monitor rv_wip; +static struct rv_monitor rv_wip; DECLARE_DA_MON_PER_CPU(wip, unsigned char); static void handle_preempt_disable(void *data, unsigned long ip, unsigned long parent_ip) @@ -60,7 +60,7 @@ static void disable_wip(void) da_monitor_destroy_wip(); } -struct rv_monitor rv_wip = { +static struct rv_monitor rv_wip = { .name = "wip", .description = "wakeup in preemptive per-cpu testing monitor.", .enable = enable_wip, @@ -69,13 +69,13 @@ struct rv_monitor rv_wip = { .enabled = 0, }; -static int register_wip(void) +static int __init register_wip(void) { rv_register_monitor(&rv_wip); return 0; } -static void unregister_wip(void) +static void __exit unregister_wip(void) { rv_unregister_monitor(&rv_wip); } diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c index 599225d9cf38..0e43dd2db685 100644 --- a/kernel/trace/rv/monitors/wwnr/wwnr.c +++ b/kernel/trace/rv/monitors/wwnr/wwnr.c @@ -15,7 +15,7 @@ #include "wwnr.h" -struct rv_monitor rv_wwnr; +static struct rv_monitor rv_wwnr; DECLARE_DA_MON_PER_TASK(wwnr, unsigned char); static void handle_switch(void *data, bool preempt, struct task_struct *p, @@ -59,7 +59,7 @@ static void disable_wwnr(void) da_monitor_destroy_wwnr(); } -struct rv_monitor rv_wwnr = { +static struct rv_monitor rv_wwnr = { .name = "wwnr", .description = "wakeup while not running per-task testing model.", .enable = enable_wwnr, @@ -68,13 +68,13 @@ struct rv_monitor rv_wwnr = { .enabled = 0, }; -static int register_wwnr(void) +static int __init register_wwnr(void) { rv_register_monitor(&rv_wwnr); return 0; } -static void unregister_wwnr(void) +static void __exit unregister_wwnr(void) { rv_unregister_monitor(&rv_wwnr); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d3005279165d..47a44b055a1d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1193,12 +1193,14 @@ void *tracing_cond_snapshot_data(struct trace_array *tr) { void *cond_data = NULL; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) cond_data = tr->cond_snapshot->cond_data; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return cond_data; } @@ -1334,9 +1336,11 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, goto fail_unlock; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); tr->cond_snapshot = cond_snapshot; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); mutex_unlock(&trace_types_lock); @@ -1363,6 +1367,7 @@ int tracing_snapshot_cond_disable(struct trace_array *tr) { int ret = 0; + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (!tr->cond_snapshot) @@ -1373,6 +1378,7 @@ int 
tracing_snapshot_cond_disable(struct trace_array *tr) } arch_spin_unlock(&tr->max_lock); + local_irq_enable(); return ret; } @@ -2200,6 +2206,11 @@ static size_t tgid_map_max; #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX +/* + * Preemption must be disabled before acquiring trace_cmdline_lock. + * The various trace_arrays' max_lock must be acquired in a context + * where interrupt is disabled. + */ static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -2412,7 +2423,11 @@ static int trace_save_cmdline(struct task_struct *tsk) * the lock, but we also don't want to spin * nor do we want to disable interrupts, * so if we miss here, then better luck next time. + * + * This is called within the scheduler and wake up, so interrupts + * had better been disabled and run queue lock been held. */ + lockdep_assert_preemption_disabled(); if (!arch_spin_trylock(&trace_cmdline_lock)) return 0; @@ -5890,9 +5905,11 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf, char buf[64]; int r; + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num); arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); } @@ -5917,10 +5934,12 @@ static int tracing_resize_saved_cmdlines(unsigned int val) return -ENOMEM; } + preempt_disable(); arch_spin_lock(&trace_cmdline_lock); savedcmd_temp = savedcmd; savedcmd = s; arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); free_saved_cmdlines_buffer(savedcmd_temp); return 0; @@ -6373,10 +6392,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) #ifdef CONFIG_TRACER_SNAPSHOT if (t->use_max_tr) { + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; } @@ -6407,12 +6428,12 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->current_trace->use_max_tr; + /* Current trace needs to be nop_trace before synchronize_rcu */ tr->current_trace = &nop_trace; -#ifdef CONFIG_TRACER_MAX_TRACE - had_max_tr = tr->allocated_snapshot; - if (had_max_tr && !t->use_max_tr) { /* * We need to make sure that the update_max_tr sees that @@ -6425,11 +6446,13 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } - if (t->use_max_tr && !had_max_tr) { + if (t->use_max_tr && !tr->allocated_snapshot) { ret = tracing_alloc_snapshot_instance(tr); if (ret < 0) goto out; } +#else + tr->current_trace = &nop_trace; #endif if (t->init) { @@ -7436,10 +7459,12 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, goto out; } + local_irq_disable(); arch_spin_lock(&tr->max_lock); if (tr->cond_snapshot) ret = -EBUSY; arch_spin_unlock(&tr->max_lock); + local_irq_enable(); if (ret) goto out; @@ -8137,6 +8162,12 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) __trace_array_put(iter->tr); + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + if (info->spare) ring_buffer_free_read_page(iter->array_buffer->buffer, info->spare_cpu, info->spare); @@ -8290,6 +8321,8 @@ tracing_buffers_splice_read(struct file *file, loff_t 
*ppos, /* did we read anything? */ if (!spd.nr_pages) { + long wait_index; + if (ret) goto out; @@ -8297,10 +8330,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) goto out; + wait_index = READ_ONCE(iter->wait_index); + ret = wait_on_pipe(iter, iter->tr->buffer_percent); if (ret) goto out; + /* No need to wait after waking up when tracing is off */ + if (!tracer_tracing_is_on(iter->tr)) + goto out; + + /* Make sure we see the new wait_index */ + smp_rmb(); + if (wait_index != iter->wait_index) + goto out; + goto again; } @@ -8311,12 +8355,34 @@ out: return ret; } +/* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */ +static long tracing_buffers_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + if (cmd) + return -ENOIOCTLCMD; + + mutex_lock(&trace_types_lock); + + iter->wait_index++; + /* Make sure the waiters see the new wait_index */ + smp_wmb(); + + ring_buffer_wake_waiters(iter->array_buffer->buffer, iter->cpu_file); + + mutex_unlock(&trace_types_lock); + return 0; +} + static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, + .unlocked_ioctl = tracing_buffers_ioctl, .llseek = no_llseek, }; @@ -9005,6 +9071,8 @@ rb_simple_write(struct file *filp, const char __user *ubuf, tracer_tracing_off(tr); if (tr->current_trace->stop) tr->current_trace->stop(tr); + /* Wake up any waiters */ + ring_buffer_wake_waiters(buffer, RING_BUFFER_ALL_CPUS); } mutex_unlock(&trace_types_lock); } @@ -10091,7 +10159,7 @@ __init static int tracer_alloc_buffers(void) * buffer. The memory will be removed once the "instance" is removed. 
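
/*
 * [Editorial sketch, not part of the patch] With the unlocked_ioctl handler
 * added above, user space gains a way to force a reader that is blocked on a
 * trace_pipe_raw file descriptor (in read or splice) to return: issue an
 * ioctl with cmd == 0 on that same descriptor, e.g. from another thread
 * before shutting the reader down. The helper below is illustrative only:
 */
#include <sys/ioctl.h>

/*
 * Kick a reader blocked on @fd (an open trace_pipe_raw file descriptor).
 * Returns 0 on success, -1 with errno set otherwise.
 */
static int kick_trace_reader(int fd)
{
	/* cmd 0 wakes up all waiters on the ring buffer behind this file */
	return ioctl(fd, 0);
}
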
*/ ret = cpuhp_setup_state_multi(CPUHP_TRACE_RB_PREPARE, - "trace/RB:preapre", trace_rb_cpu_prepare, + "trace/RB:prepare", trace_rb_cpu_prepare, NULL); if (ret < 0) goto out_free_cpumask; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 900e75d96c84..54ee5711c729 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1435,8 +1435,6 @@ event_trigger_unlock_commit(struct trace_event_file *file, struct filter_pred; struct regex; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); - typedef int (*regex_match_func)(char *str, struct regex *r, int len); enum regex_type { @@ -1455,17 +1453,6 @@ struct regex { regex_match_func match; }; -struct filter_pred { - filter_pred_fn_t fn; - u64 val; - struct regex regex; - unsigned short *ops; - struct ftrace_event_field *field; - int offset; - int not; - int op; -}; - static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 801c2a7f7605..54d5fa35c90a 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -51,7 +51,7 @@ static void trace_do_benchmark(void) local_irq_disable(); start = trace_clock_local(); - trace_benchmark_event(bm_str); + trace_benchmark_event(bm_str, bm_last); stop = trace_clock_local(); local_irq_enable(); diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h index 79e6fbe5b365..c3e91060dc94 100644 --- a/kernel/trace/trace_benchmark.h +++ b/kernel/trace/trace_benchmark.h @@ -14,19 +14,21 @@ extern void trace_benchmark_unreg(void); TRACE_EVENT_FN(benchmark_event, - TP_PROTO(const char *str), + TP_PROTO(const char *str, u64 delta), - TP_ARGS(str), + TP_ARGS(str, delta), TP_STRUCT__entry( __array( char, str, BENCHMARK_EVENT_STRLEN ) + __field( u64, delta) ), TP_fast_assign( memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN); + __entry->delta = delta; ), - TP_printk("%s", __entry->str), + TP_printk("%s delta=%llu", __entry->str, __entry->delta), trace_benchmark_reg, trace_benchmark_unreg ); diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index 1783e3478912..5dd0617e5df6 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -16,6 +16,7 @@ #include "trace_dynevent.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define EPROBE_EVENT_SYSTEM "eprobes" @@ -26,6 +27,9 @@ struct trace_eprobe { /* tracepoint event */ const char *event_name; + /* filter string for the tracepoint */ + char *filter_str; + struct trace_event_call *event; struct dyn_event devent; @@ -453,29 +457,14 @@ NOKPROBE_SYMBOL(process_fetch_insn) static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? 
ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -485,21 +474,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -509,29 +484,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. - */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int @@ -664,14 +617,15 @@ static struct event_trigger_data * new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) { struct event_trigger_data *trigger; + struct event_filter *filter = NULL; struct eprobe_data *edata; + int ret; edata = kzalloc(sizeof(*edata), GFP_KERNEL); trigger = kzalloc(sizeof(*trigger), GFP_KERNEL); if (!trigger || !edata) { - kfree(edata); - kfree(trigger); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto error; } trigger->flags = EVENT_TRIGGER_FL_PROBE; @@ -686,13 +640,25 @@ new_eprobe_trigger(struct trace_eprobe *ep, struct trace_event_file *file) trigger->cmd_ops = &event_trigger_cmd; INIT_LIST_HEAD(&trigger->list); - RCU_INIT_POINTER(trigger->filter, NULL); + + if (ep->filter_str) { + ret = create_event_filter(file->tr, file->event_call, + ep->filter_str, false, &filter); + if (ret) + goto error; + } + RCU_INIT_POINTER(trigger->filter, filter); edata->file = file; edata->ep = ep; trigger->private_data = edata; return trigger; +error: + free_event_filter(filter); + kfree(edata); + kfree(trigger); + return ERR_PTR(ret); } static int enable_eprobe(struct trace_eprobe *ep, @@ -726,6 +692,7 @@ static int disable_eprobe(struct trace_eprobe *ep, { struct event_trigger_data *trigger = NULL, *iter; struct trace_event_file *file; + struct event_filter *filter; struct eprobe_data *edata; file = find_event_file(tr, ep->event_system, ep->event_name); @@ -752,6 +719,10 @@ static int disable_eprobe(struct trace_eprobe *ep, /* Make sure nothing is using the edata or trigger */ tracepoint_synchronize_unregister(); + filter = rcu_access_pointer(trigger->filter); + + if (filter) + free_event_filter(filter); kfree(edata); kfree(trigger); @@ -927,12 +898,62 @@ static int trace_eprobe_tp_update_arg(struct trace_eprobe *ep, const char *argv[ return ret; } +static int trace_eprobe_parse_filter(struct trace_eprobe *ep, int argc, const char *argv[]) +{ + struct event_filter *dummy; + int i, ret, len = 0; + char *p; + + if (argc == 0) { + trace_probe_log_err(0, NO_EP_FILTER); + return -EINVAL; + } + + /* Recover the filter string */ + for (i = 0; i 
< argc; i++) + len += strlen(argv[i]) + 1; + + ep->filter_str = kzalloc(len, GFP_KERNEL); + if (!ep->filter_str) + return -ENOMEM; + + p = ep->filter_str; + for (i = 0; i < argc; i++) { + ret = snprintf(p, len, "%s ", argv[i]); + if (ret < 0) + goto error; + if (ret > len) { + ret = -E2BIG; + goto error; + } + p += ret; + len -= ret; + } + p[-1] = '\0'; + + /* + * Ensure the filter string can be parsed correctly. Note, this + * filter string is for the original event, not for the eprobe. + */ + ret = create_event_filter(top_trace_array(), ep->event, ep->filter_str, + true, &dummy); + free_event_filter(dummy); + if (ret) + goto error; + + return 0; +error: + kfree(ep->filter_str); + ep->filter_str = NULL; + return ret; +} + static int __trace_eprobe_create(int argc, const char *argv[]) { /* * Argument syntax: - * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] - * Fetch args: + * e[:[GRP/][ENAME]] SYSTEM.EVENT [FETCHARGS] [if FILTER] + * Fetch args (no space): * <name>=$<field>[:TYPE] */ const char *event = NULL, *group = EPROBE_EVENT_SYSTEM; @@ -942,8 +963,8 @@ static int __trace_eprobe_create(int argc, const char *argv[]) char buf1[MAX_EVENT_NAME_LEN]; char buf2[MAX_EVENT_NAME_LEN]; char gbuf[MAX_EVENT_NAME_LEN]; - int ret = 0; - int i; + int ret = 0, filter_idx = 0; + int i, filter_cnt; if (argc < 2 || argv[0][0] != 'e') return -ECANCELED; @@ -968,11 +989,19 @@ static int __trace_eprobe_create(int argc, const char *argv[]) } if (!event) { - strscpy(buf1, argv[1], MAX_EVENT_NAME_LEN); - sanitize_event_name(buf1); + strscpy(buf1, sys_event, MAX_EVENT_NAME_LEN); event = buf1; } + for (i = 2; i < argc; i++) { + if (!strcmp(argv[i], "if")) { + filter_idx = i + 1; + filter_cnt = argc - filter_idx; + argc = i; + break; + } + } + mutex_lock(&event_mutex); event_call = find_and_get_event(sys_name, sys_event); ep = alloc_event_probe(group, event, event_call, argc - 2); @@ -988,6 +1017,14 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } + if (filter_idx) { + trace_probe_log_set_index(filter_idx); + ret = trace_eprobe_parse_filter(ep, filter_cnt, argv + filter_idx); + if (ret) + goto parse_error; + } else + ep->filter_str = NULL; + argc -= 2; argv += 2; /* parse arguments */ for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 4b1057ab9d96..96acc2b71ac7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -43,6 +43,42 @@ enum filter_op_ids { OPS }; static const char * ops[] = { OPS }; +enum filter_pred_fn { + FILTER_PRED_FN_NOP, + FILTER_PRED_FN_64, + FILTER_PRED_FN_S64, + FILTER_PRED_FN_U64, + FILTER_PRED_FN_32, + FILTER_PRED_FN_S32, + FILTER_PRED_FN_U32, + FILTER_PRED_FN_16, + FILTER_PRED_FN_S16, + FILTER_PRED_FN_U16, + FILTER_PRED_FN_8, + FILTER_PRED_FN_S8, + FILTER_PRED_FN_U8, + FILTER_PRED_FN_COMM, + FILTER_PRED_FN_STRING, + FILTER_PRED_FN_STRLOC, + FILTER_PRED_FN_STRRELLOC, + FILTER_PRED_FN_PCHAR_USER, + FILTER_PRED_FN_PCHAR, + FILTER_PRED_FN_CPU, + FILTER_PRED_FN_, + FILTER_PRED_TEST_VISITED, +}; + +struct filter_pred { + enum filter_pred_fn fn_num; + u64 val; + struct regex regex; + unsigned short *ops; + struct ftrace_event_field *field; + int offset; + int not; + int op; +}; + /* * pred functions are OP_LE, OP_LT, OP_GE, OP_GT, and OP_BAND * pred_funcs_##type below must match the order of them above. 
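
/*
 * [Editorial sketch, not part of the patch] The trace_events_filter.c changes
 * around here drop the ->fn function pointer: struct filter_pred (above) now
 * stores an enum, and filter_match_preds() is rerouted through one central
 * switch, filter_pred_fn_call(), added below, so each predicate costs a
 * direct call instead of an indirect one. The bare pattern, reduced to
 * hypothetical names and two predicate kinds:
 */
#include <stdint.h>

enum pred_fn { PRED_FN_NOP, PRED_FN_EQ_U32, PRED_FN_LT_U32 };

struct pred {
	enum pred_fn fn_num;	/* which handler to run, instead of a pointer */
	int offset;		/* byte offset of the field inside the record */
	uint64_t val;
};

static int pred_eq_u32(const struct pred *p, const void *rec)
{
	const uint32_t *addr = (const void *)((const char *)rec + p->offset);

	return *addr == (uint32_t)p->val;
}

static int pred_lt_u32(const struct pred *p, const void *rec)
{
	const uint32_t *addr = (const void *)((const char *)rec + p->offset);

	return *addr < (uint32_t)p->val;
}

/* One switch replaces the stored function pointer on the filter fast path. */
static int pred_fn_call(const struct pred *p, const void *rec)
{
	switch (p->fn_num) {
	case PRED_FN_EQ_U32:
		return pred_eq_u32(p, rec);
	case PRED_FN_LT_U32:
		return pred_lt_u32(p, rec);
	case PRED_FN_NOP:
	default:
		return 0;
	}
}
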
@@ -590,44 +626,48 @@ out_free: return ERR_PTR(ret); } +enum pred_cmp_types { + PRED_CMP_TYPE_NOP, + PRED_CMP_TYPE_LT, + PRED_CMP_TYPE_LE, + PRED_CMP_TYPE_GT, + PRED_CMP_TYPE_GE, + PRED_CMP_TYPE_BAND, +}; + #define DEFINE_COMPARISON_PRED(type) \ -static int filter_pred_LT_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr < val; \ -} \ -static int filter_pred_LE_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr <= val; \ -} \ -static int filter_pred_GT_##type(struct filter_pred *pred, void *event) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ { \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr > val; \ -} \ -static int filter_pred_GE_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return *addr >= val; \ -} \ -static int filter_pred_BAND_##type(struct filter_pred *pred, void *event) \ -{ \ - type *addr = (type *)(event + pred->offset); \ - type val = (type)pred->val; \ - return !!(*addr & val); \ -} \ -static const filter_pred_fn_t pred_funcs_##type[] = { \ - filter_pred_LE_##type, \ - filter_pred_LT_##type, \ - filter_pred_GE_##type, \ - filter_pred_GT_##type, \ - filter_pred_BAND_##type, \ -}; + switch (pred->op) { \ + case OP_LT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr < val; \ + } \ + case OP_LE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr <= val; \ + } \ + case OP_GT: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr > val; \ + } \ + case OP_GE: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return *addr >= val; \ + } \ + case OP_BAND: { \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + return !!(*addr & val); \ + } \ + default: \ + return 0; \ + } \ +} #define DEFINE_EQUALITY_PRED(size) \ static int filter_pred_##size(struct filter_pred *pred, void *event) \ @@ -836,11 +876,6 @@ static int filter_pred_comm(struct filter_pred *pred, void *event) return cmp ^ pred->not; } -static int filter_pred_none(struct filter_pred *pred, void *event) -{ - return 0; -} - /* * regex_match_foo - Basic regex callbacks * @@ -986,6 +1021,19 @@ static void filter_build_regex(struct filter_pred *pred) } } + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int test_pred_visited_fn(struct filter_pred *pred, void *event); +#else +static int test_pred_visited_fn(struct filter_pred *pred, void *event) +{ + return 0; +} +#endif + + +static int filter_pred_fn_call(struct filter_pred *pred, void *event); + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct event_filter *filter, void *rec) { @@ -1003,7 +1051,7 @@ int filter_match_preds(struct event_filter *filter, void *rec) for (i = 0; prog[i].pred; i++) { struct filter_pred *pred = prog[i].pred; - int match = pred->fn(pred, rec); + int match = filter_pred_fn_call(pred, rec); if (match == prog[i].when_to_branch) i = prog[i].target; } @@ -1189,10 +1237,10 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, - int field_size, int field_is_signed) +static enum 
filter_pred_fn select_comparison_fn(enum filter_op_ids op, + int field_size, int field_is_signed) { - filter_pred_fn_t fn = NULL; + enum filter_pred_fn fn = FILTER_PRED_FN_NOP; int pred_func_index = -1; switch (op) { @@ -1201,50 +1249,99 @@ static filter_pred_fn_t select_comparison_fn(enum filter_op_ids op, break; default: if (WARN_ON_ONCE(op < PRED_FUNC_START)) - return NULL; + return fn; pred_func_index = op - PRED_FUNC_START; if (WARN_ON_ONCE(pred_func_index > PRED_FUNC_MAX)) - return NULL; + return fn; } switch (field_size) { case 8: if (pred_func_index < 0) - fn = filter_pred_64; + fn = FILTER_PRED_FN_64; else if (field_is_signed) - fn = pred_funcs_s64[pred_func_index]; + fn = FILTER_PRED_FN_S64; else - fn = pred_funcs_u64[pred_func_index]; + fn = FILTER_PRED_FN_U64; break; case 4: if (pred_func_index < 0) - fn = filter_pred_32; + fn = FILTER_PRED_FN_32; else if (field_is_signed) - fn = pred_funcs_s32[pred_func_index]; + fn = FILTER_PRED_FN_S32; else - fn = pred_funcs_u32[pred_func_index]; + fn = FILTER_PRED_FN_U32; break; case 2: if (pred_func_index < 0) - fn = filter_pred_16; + fn = FILTER_PRED_FN_16; else if (field_is_signed) - fn = pred_funcs_s16[pred_func_index]; + fn = FILTER_PRED_FN_S16; else - fn = pred_funcs_u16[pred_func_index]; + fn = FILTER_PRED_FN_U16; break; case 1: if (pred_func_index < 0) - fn = filter_pred_8; + fn = FILTER_PRED_FN_8; else if (field_is_signed) - fn = pred_funcs_s8[pred_func_index]; + fn = FILTER_PRED_FN_S8; else - fn = pred_funcs_u8[pred_func_index]; + fn = FILTER_PRED_FN_U8; break; } return fn; } + +static int filter_pred_fn_call(struct filter_pred *pred, void *event) +{ + switch (pred->fn_num) { + case FILTER_PRED_FN_64: + return filter_pred_64(pred, event); + case FILTER_PRED_FN_S64: + return filter_pred_s64(pred, event); + case FILTER_PRED_FN_U64: + return filter_pred_u64(pred, event); + case FILTER_PRED_FN_32: + return filter_pred_32(pred, event); + case FILTER_PRED_FN_S32: + return filter_pred_s32(pred, event); + case FILTER_PRED_FN_U32: + return filter_pred_u32(pred, event); + case FILTER_PRED_FN_16: + return filter_pred_16(pred, event); + case FILTER_PRED_FN_S16: + return filter_pred_s16(pred, event); + case FILTER_PRED_FN_U16: + return filter_pred_u16(pred, event); + case FILTER_PRED_FN_8: + return filter_pred_8(pred, event); + case FILTER_PRED_FN_S8: + return filter_pred_s8(pred, event); + case FILTER_PRED_FN_U8: + return filter_pred_u8(pred, event); + case FILTER_PRED_FN_COMM: + return filter_pred_comm(pred, event); + case FILTER_PRED_FN_STRING: + return filter_pred_string(pred, event); + case FILTER_PRED_FN_STRLOC: + return filter_pred_strloc(pred, event); + case FILTER_PRED_FN_STRRELLOC: + return filter_pred_strrelloc(pred, event); + case FILTER_PRED_FN_PCHAR_USER: + return filter_pred_pchar_user(pred, event); + case FILTER_PRED_FN_PCHAR: + return filter_pred_pchar(pred, event); + case FILTER_PRED_FN_CPU: + return filter_pred_cpu(pred, event); + case FILTER_PRED_TEST_VISITED: + return test_pred_visited_fn(pred, event); + default: + return 0; + } +} + /* Called when a predicate is encountered by predicate_parse() */ static int parse_pred(const char *str, void *data, int pos, struct filter_parse_error *pe, @@ -1338,7 +1435,7 @@ static int parse_pred(const char *str, void *data, parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); goto err_free; } - pred->fn = filter_pred_none; + pred->fn_num = FILTER_PRED_FN_NOP; /* * Quotes are not required, but if they exist then we need @@ -1416,16 +1513,16 @@ static int parse_pred(const char *str, void 
*data, filter_build_regex(pred); if (field->filter_type == FILTER_COMM) { - pred->fn = filter_pred_comm; + pred->fn_num = FILTER_PRED_FN_COMM; } else if (field->filter_type == FILTER_STATIC_STRING) { - pred->fn = filter_pred_string; + pred->fn_num = FILTER_PRED_FN_STRING; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - pred->fn = filter_pred_strloc; + pred->fn_num = FILTER_PRED_FN_STRLOC; } else if (field->filter_type == FILTER_RDYN_STRING) - pred->fn = filter_pred_strrelloc; + pred->fn_num = FILTER_PRED_FN_STRRELLOC; else { if (!ustring_per_cpu) { @@ -1436,9 +1533,9 @@ static int parse_pred(const char *str, void *data, } if (ustring) - pred->fn = filter_pred_pchar_user; + pred->fn_num = FILTER_PRED_FN_PCHAR_USER; else - pred->fn = filter_pred_pchar; + pred->fn_num = FILTER_PRED_FN_PCHAR; } /* go past the last quote */ i++; @@ -1486,10 +1583,10 @@ static int parse_pred(const char *str, void *data, pred->val = val; if (field->filter_type == FILTER_CPU) - pred->fn = filter_pred_cpu; + pred->fn_num = FILTER_PRED_FN_CPU; else { - pred->fn = select_comparison_fn(pred->op, field->size, - field->is_signed); + pred->fn_num = select_comparison_fn(pred->op, field->size, + field->is_signed); if (pred->op == OP_NE) pred->not = 1; } @@ -2296,7 +2393,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) struct filter_pred *pred = prog[i].pred; struct ftrace_event_field *field = pred->field; - WARN_ON_ONCE(!pred->fn); + WARN_ON_ONCE(pred->fn_num == FILTER_PRED_FN_NOP); if (!field) { WARN_ONCE(1, "all leafs should have field defined %d", i); @@ -2306,7 +2403,7 @@ static void update_pred_fn(struct event_filter *filter, char *fields) if (!strchr(fields, *field->name)) continue; - pred->fn = test_pred_visited_fn; + pred->fn_num = FILTER_PRED_TEST_VISITED; } } diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index fdf784620c28..48465f7e97b4 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -104,6 +104,38 @@ enum field_op_id { FIELD_OP_MULT, }; +enum hist_field_fn { + HIST_FIELD_FN_NOP, + HIST_FIELD_FN_VAR_REF, + HIST_FIELD_FN_COUNTER, + HIST_FIELD_FN_CONST, + HIST_FIELD_FN_LOG2, + HIST_FIELD_FN_BUCKET, + HIST_FIELD_FN_TIMESTAMP, + HIST_FIELD_FN_CPU, + HIST_FIELD_FN_STRING, + HIST_FIELD_FN_DYNSTRING, + HIST_FIELD_FN_RELDYNSTRING, + HIST_FIELD_FN_PSTRING, + HIST_FIELD_FN_S64, + HIST_FIELD_FN_U64, + HIST_FIELD_FN_S32, + HIST_FIELD_FN_U32, + HIST_FIELD_FN_S16, + HIST_FIELD_FN_U16, + HIST_FIELD_FN_S8, + HIST_FIELD_FN_U8, + HIST_FIELD_FN_UMINUS, + HIST_FIELD_FN_MINUS, + HIST_FIELD_FN_PLUS, + HIST_FIELD_FN_DIV, + HIST_FIELD_FN_MULT, + HIST_FIELD_FN_DIV_POWER2, + HIST_FIELD_FN_DIV_NOT_POWER2, + HIST_FIELD_FN_DIV_MULT_SHIFT, + HIST_FIELD_FN_EXECNAME, +}; + /* * A hist_var (histogram variable) contains variable information for * hist_fields having the HIST_FIELD_FL_VAR or HIST_FIELD_FL_VAR_REF @@ -123,15 +155,15 @@ struct hist_var { struct hist_field { struct ftrace_event_field *field; unsigned long flags; - hist_field_fn_t fn; - unsigned int ref; - unsigned int size; - unsigned int offset; - unsigned int is_signed; unsigned long buckets; const char *type; struct hist_field *operands[HIST_FIELD_OPERANDS_MAX]; struct hist_trigger_data *hist_data; + enum hist_field_fn fn_num; + unsigned int ref; + unsigned int size; + unsigned int offset; + unsigned int is_signed; /* * Variable fields contain variable-specific info in var. 
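
/*
 * [Editorial sketch, not part of the patch] Besides the same enum-dispatch
 * conversion applied to hist_field above, hist_field_get_div_fn() further
 * down in this file picks a cheaper strategy when a histogram expression
 * divides by a constant: a plain shift if the divisor is a power of two,
 * otherwise a precomputed multiply-and-shift, falling back to a real division
 * for very large divisors. The core idea as a standalone helper; the shift
 * width is illustrative and this simplified version truncates rather than
 * necessarily matching the kernel's exact arithmetic:
 */
#include <stdint.h>

#define DIV_SHIFT 20	/* illustrative stand-in for HIST_DIV_SHIFT */

struct const_div {
	uint64_t divisor;	/* assumed non-zero */
	uint64_t multiplier;	/* ~ (1 << DIV_SHIFT) / divisor, precomputed once */
};

static void const_div_init(struct const_div *d, uint64_t divisor)
{
	d->divisor = divisor;
	d->multiplier = (1ULL << DIV_SHIFT) / divisor;
}

static uint64_t const_div_apply(const struct const_div *d, uint64_t val)
{
	/* Power of two: an exact shift. */
	if ((d->divisor & (d->divisor - 1)) == 0)
		return val >> __builtin_ctzll(d->divisor);

	/* Too large for the reciprocal trick: fall back to real division. */
	if (d->divisor > (1ULL << DIV_SHIFT))
		return val / d->divisor;

	/* Approximate val / divisor as (val * multiplier) >> DIV_SHIFT. */
	return (val * d->multiplier) >> DIV_SHIFT;
}
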
@@ -166,14 +198,11 @@ struct hist_field { u64 div_multiplier; }; -static u64 hist_field_none(struct hist_field *field, - struct tracing_map_elt *elt, - struct trace_buffer *buffer, - struct ring_buffer_event *rbe, - void *event) -{ - return 0; -} +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event); static u64 hist_field_const(struct hist_field *field, struct tracing_map_elt *elt, @@ -250,7 +279,7 @@ static u64 hist_field_log2(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); return (u64) ilog2(roundup_pow_of_two(val)); } @@ -264,7 +293,7 @@ static u64 hist_field_bucket(struct hist_field *hist_field, struct hist_field *operand = hist_field->operands[0]; unsigned long buckets = hist_field->buckets; - u64 val = operand->fn(operand, elt, buffer, rbe, event); + u64 val = hist_fn_call(operand, elt, buffer, rbe, event); if (WARN_ON_ONCE(!buckets)) return val; @@ -285,8 +314,8 @@ static u64 hist_field_plus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 + val2; } @@ -300,8 +329,8 @@ static u64 hist_field_minus(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 - val2; } @@ -315,8 +344,8 @@ static u64 hist_field_div(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); /* Return -1 for the undefined case */ if (!val2) @@ -338,7 +367,7 @@ static u64 div_by_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return val1 >> __ffs64(operand2->constant); } @@ -352,7 +381,7 @@ static u64 div_by_not_power_of_two(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); return div64_u64(val1, operand2->constant); } @@ -366,7 +395,7 @@ static u64 div_by_mult_and_shift(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); + u64 val1 = 
hist_fn_call(operand1, elt, buffer, rbe, event); /* * If the divisor is a constant, do a multiplication and shift instead. @@ -400,8 +429,8 @@ static u64 hist_field_mult(struct hist_field *hist_field, struct hist_field *operand1 = hist_field->operands[0]; struct hist_field *operand2 = hist_field->operands[1]; - u64 val1 = operand1->fn(operand1, elt, buffer, rbe, event); - u64 val2 = operand2->fn(operand2, elt, buffer, rbe, event); + u64 val1 = hist_fn_call(operand1, elt, buffer, rbe, event); + u64 val2 = hist_fn_call(operand2, elt, buffer, rbe, event); return val1 * val2; } @@ -414,7 +443,7 @@ static u64 hist_field_unary_minus(struct hist_field *hist_field, { struct hist_field *operand = hist_field->operands[0]; - s64 sval = (s64)operand->fn(operand, elt, buffer, rbe, event); + s64 sval = (s64)hist_fn_call(operand, elt, buffer, rbe, event); u64 val = (u64)-sval; return val; @@ -657,19 +686,19 @@ struct snapshot_context { * Returns the specific division function to use if the divisor * is constant. This avoids extra branches when the trigger is hit. */ -static hist_field_fn_t hist_field_get_div_fn(struct hist_field *divisor) +static enum hist_field_fn hist_field_get_div_fn(struct hist_field *divisor) { u64 div = divisor->constant; if (!(div & (div - 1))) - return div_by_power_of_two; + return HIST_FIELD_FN_DIV_POWER2; /* If the divisor is too large, do a regular division */ if (div > (1 << HIST_DIV_SHIFT)) - return div_by_not_power_of_two; + return HIST_FIELD_FN_DIV_NOT_POWER2; divisor->div_multiplier = div64_u64((u64)(1 << HIST_DIV_SHIFT), div); - return div_by_mult_and_shift; + return HIST_FIELD_FN_DIV_MULT_SHIFT; } static void track_data_free(struct track_data *track_data) @@ -1334,38 +1363,32 @@ static const char *hist_field_name(struct hist_field *field, return field_name; } -static hist_field_fn_t select_value_fn(int field_size, int field_is_signed) +static enum hist_field_fn select_value_fn(int field_size, int field_is_signed) { - hist_field_fn_t fn = NULL; - switch (field_size) { case 8: if (field_is_signed) - fn = hist_field_s64; + return HIST_FIELD_FN_S64; else - fn = hist_field_u64; - break; + return HIST_FIELD_FN_U64; case 4: if (field_is_signed) - fn = hist_field_s32; + return HIST_FIELD_FN_S32; else - fn = hist_field_u32; - break; + return HIST_FIELD_FN_U32; case 2: if (field_is_signed) - fn = hist_field_s16; + return HIST_FIELD_FN_S16; else - fn = hist_field_u16; - break; + return HIST_FIELD_FN_U16; case 1: if (field_is_signed) - fn = hist_field_s8; + return HIST_FIELD_FN_S8; else - fn = hist_field_u8; - break; + return HIST_FIELD_FN_U8; } - return fn; + return HIST_FIELD_FN_NOP; } static int parse_map_size(char *str) @@ -1922,19 +1945,19 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto out; /* caller will populate */ if (flags & HIST_FIELD_FL_VAR_REF) { - hist_field->fn = hist_field_var_ref; + hist_field->fn_num = HIST_FIELD_FN_VAR_REF; goto out; } if (flags & HIST_FIELD_FL_HITCOUNT) { - hist_field->fn = hist_field_counter; + hist_field->fn_num = HIST_FIELD_FN_COUNTER; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CONST) { - hist_field->fn = hist_field_const; + hist_field->fn_num = HIST_FIELD_FN_CONST; hist_field->size = sizeof(u64); hist_field->type = kstrdup("u64", GFP_KERNEL); if (!hist_field->type) @@ -1943,14 +1966,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_STACKTRACE) { - hist_field->fn = hist_field_none; + 
hist_field->fn_num = HIST_FIELD_FN_NOP; goto out; } if (flags & (HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET)) { unsigned long fl = flags & ~(HIST_FIELD_FL_LOG2 | HIST_FIELD_FL_BUCKET); - hist_field->fn = flags & HIST_FIELD_FL_LOG2 ? hist_field_log2 : - hist_field_bucket; + hist_field->fn_num = flags & HIST_FIELD_FL_LOG2 ? HIST_FIELD_FN_LOG2 : + HIST_FIELD_FN_BUCKET; hist_field->operands[0] = create_hist_field(hist_data, field, fl, NULL); hist_field->size = hist_field->operands[0]->size; hist_field->type = kstrdup_const(hist_field->operands[0]->type, GFP_KERNEL); @@ -1960,14 +1983,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, } if (flags & HIST_FIELD_FL_TIMESTAMP) { - hist_field->fn = hist_field_timestamp; + hist_field->fn_num = HIST_FIELD_FN_TIMESTAMP; hist_field->size = sizeof(u64); hist_field->type = "u64"; goto out; } if (flags & HIST_FIELD_FL_CPU) { - hist_field->fn = hist_field_cpu; + hist_field->fn_num = HIST_FIELD_FN_CPU; hist_field->size = sizeof(int); hist_field->type = "unsigned int"; goto out; @@ -1987,14 +2010,14 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, goto free; if (field->filter_type == FILTER_STATIC_STRING) { - hist_field->fn = hist_field_string; + hist_field->fn_num = HIST_FIELD_FN_STRING; hist_field->size = field->size; } else if (field->filter_type == FILTER_DYN_STRING) { - hist_field->fn = hist_field_dynstring; + hist_field->fn_num = HIST_FIELD_FN_DYNSTRING; } else if (field->filter_type == FILTER_RDYN_STRING) - hist_field->fn = hist_field_reldynstring; + hist_field->fn_num = HIST_FIELD_FN_RELDYNSTRING; else - hist_field->fn = hist_field_pstring; + hist_field->fn_num = HIST_FIELD_FN_PSTRING; } else { hist_field->size = field->size; hist_field->is_signed = field->is_signed; @@ -2002,9 +2025,9 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field->type) goto free; - hist_field->fn = select_value_fn(field->size, - field->is_signed); - if (!hist_field->fn) { + hist_field->fn_num = select_value_fn(field->size, + field->is_signed); + if (hist_field->fn_num == HIST_FIELD_FN_NOP) { destroy_hist_field(hist_field, 0); return NULL; } @@ -2340,7 +2363,7 @@ static struct hist_field *create_alias(struct hist_trigger_data *hist_data, if (!alias) return NULL; - alias->fn = var_ref->fn; + alias->fn_num = var_ref->fn_num; alias->operands[0] = var_ref; if (init_var_ref(alias, var_ref, var_ref->system, var_ref->event_name)) { @@ -2523,7 +2546,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, expr->flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); - expr->fn = hist_field_unary_minus; + expr->fn_num = HIST_FIELD_FN_UMINUS; expr->operands[0] = operand1; expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -2595,7 +2618,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, unsigned long operand_flags, operand2_flags; int field_op, ret = -EINVAL; char *sep, *operand1_str; - hist_field_fn_t op_fn; + enum hist_field_fn op_fn; bool combine_consts; if (*n_subexprs > 3) { @@ -2654,16 +2677,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, switch (field_op) { case FIELD_OP_MINUS: - op_fn = hist_field_minus; + op_fn = HIST_FIELD_FN_MINUS; break; case FIELD_OP_PLUS: - op_fn = hist_field_plus; + op_fn = HIST_FIELD_FN_PLUS; break; case FIELD_OP_DIV: - op_fn = hist_field_div; + op_fn = HIST_FIELD_FN_DIV; break; case FIELD_OP_MULT: - op_fn = 
hist_field_mult; + op_fn = HIST_FIELD_FN_MULT; break; default: ret = -EINVAL; @@ -2719,13 +2742,16 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, op_fn = hist_field_get_div_fn(operand2); } + expr->fn_num = op_fn; + if (combine_consts) { if (var1) expr->operands[0] = var1; if (var2) expr->operands[1] = var2; - expr->constant = op_fn(expr, NULL, NULL, NULL, NULL); + expr->constant = hist_fn_call(expr, NULL, NULL, NULL, NULL); + expr->fn_num = HIST_FIELD_FN_CONST; expr->operands[0] = NULL; expr->operands[1] = NULL; @@ -2739,8 +2765,6 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, expr->name = expr_str(expr, 0); } else { - expr->fn = op_fn; - /* The operand sizes should be the same, so just pick one */ expr->size = operand1->size; expr->is_signed = operand1->is_signed; @@ -3065,7 +3089,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, struct hist_field *var = field_var->var; struct hist_field *val = field_var->val; - var_val = val->fn(val, elt, buffer, rbe, rec); + var_val = hist_fn_call(val, elt, buffer, rbe, rec); var_idx = var->var.idx; if (val->flags & HIST_FIELD_FL_STRING) { @@ -4186,6 +4210,74 @@ static u64 hist_field_execname(struct hist_field *hist_field, return (u64)(unsigned long)(elt_data->comm); } +static u64 hist_fn_call(struct hist_field *hist_field, + struct tracing_map_elt *elt, + struct trace_buffer *buffer, + struct ring_buffer_event *rbe, + void *event) +{ + switch (hist_field->fn_num) { + case HIST_FIELD_FN_VAR_REF: + return hist_field_var_ref(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_COUNTER: + return hist_field_counter(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CONST: + return hist_field_const(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_LOG2: + return hist_field_log2(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_BUCKET: + return hist_field_bucket(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_TIMESTAMP: + return hist_field_timestamp(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_CPU: + return hist_field_cpu(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_STRING: + return hist_field_string(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DYNSTRING: + return hist_field_dynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_RELDYNSTRING: + return hist_field_reldynstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PSTRING: + return hist_field_pstring(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S64: + return hist_field_s64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U64: + return hist_field_u64(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S32: + return hist_field_s32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U32: + return hist_field_u32(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S16: + return hist_field_s16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U16: + return hist_field_u16(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_S8: + return hist_field_s8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_U8: + return hist_field_u8(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_UMINUS: + return hist_field_unary_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MINUS: + return hist_field_minus(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_PLUS: + return hist_field_plus(hist_field, elt, buffer, rbe, event); + case 
HIST_FIELD_FN_DIV: + return hist_field_div(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_MULT: + return hist_field_mult(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_POWER2: + return div_by_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_NOT_POWER2: + return div_by_not_power_of_two(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_DIV_MULT_SHIFT: + return div_by_mult_and_shift(hist_field, elt, buffer, rbe, event); + case HIST_FIELD_FN_EXECNAME: + return hist_field_execname(hist_field, elt, buffer, rbe, event); + default: + return 0; + } +} + /* Convert a var that points to common_pid.execname to a string */ static void update_var_execname(struct hist_field *hist_field) { @@ -4197,7 +4289,7 @@ static void update_var_execname(struct hist_field *hist_field) kfree_const(hist_field->type); hist_field->type = "char[]"; - hist_field->fn = hist_field_execname; + hist_field->fn_num = HIST_FIELD_FN_EXECNAME; } static int create_var_field(struct hist_trigger_data *hist_data, @@ -4956,7 +5048,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_val_field(i, hist_data) { hist_field = hist_data->fields[i]; - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); if (hist_field->flags & HIST_FIELD_FL_VAR) { var_idx = hist_field->var.idx; @@ -4987,7 +5079,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, for_each_hist_key_field(i, hist_data) { hist_field = hist_data->fields[i]; if (hist_field->flags & HIST_FIELD_FL_VAR) { - hist_val = hist_field->fn(hist_field, elt, buffer, rbe, rec); + hist_val = hist_fn_call(hist_field, elt, buffer, rbe, rec); var_idx = hist_field->var.idx; tracing_map_set_var(elt, var_idx, hist_val); } @@ -5062,7 +5154,7 @@ static void event_hist_trigger(struct event_trigger_data *data, HIST_STACKTRACE_SKIP); key = entries; } else { - field_contents = key_field->fn(key_field, elt, buffer, rbe, rec); + field_contents = hist_fn_call(key_field, elt, buffer, rbe, rec); if (key_field->flags & HIST_FIELD_FL_STRING) { key = (void *)(unsigned long)field_contents; use_compound_key = true; diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 5e8c07aef071..e310052dc83c 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -17,6 +17,8 @@ /* for gfp flag names */ #include <linux/trace_events.h> #include <trace/events/mmflags.h> +#include "trace_probe.h" +#include "trace_probe_kernel.h" #include "trace_synth.h" @@ -409,6 +411,7 @@ static unsigned int trace_string(struct synth_trace_event *entry, { unsigned int len = 0; char *str_field; + int ret; if (is_dynamic) { u32 data_offset; @@ -417,19 +420,27 @@ static unsigned int trace_string(struct synth_trace_event *entry, data_offset += event->n_u64 * sizeof(u64); data_offset += data_size; - str_field = (char *)entry + data_offset; - - len = strlen(str_val) + 1; - strscpy(str_field, str_val, len); + len = kern_fetch_store_strlen((unsigned long)str_val); data_offset |= len << 16; *(u32 *)&entry->fields[*n_u64] = data_offset; + ret = kern_fetch_store_string((unsigned long)str_val, &entry->fields[*n_u64], entry); + (*n_u64)++; } else { str_field = (char *)&entry->fields[*n_u64]; - strscpy(str_field, str_val, STR_VAR_LEN_MAX); +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)str_val < TASK_SIZE) + ret = strncpy_from_user_nofault(str_field, str_val, 
STR_VAR_LEN_MAX); + else +#endif + ret = strncpy_from_kernel_nofault(str_field, str_val, STR_VAR_LEN_MAX); + + if (ret < 0) + strcpy(str_field, FAULT_STRING); + (*n_u64) += STR_VAR_LEN_MAX / sizeof(u64); } @@ -462,7 +473,7 @@ static notrace void trace_event_raw_event_synth(void *__data, val_idx = var_ref_idx[field_pos]; str_val = (char *)(long)var_ref_vals[val_idx]; - len = strlen(str_val) + 1; + len = kern_fetch_store_strlen((unsigned long)str_val); fields_size += len; } diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index a6621c52ce45..ae78c2d53c8a 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -14,6 +14,7 @@ #include <linux/uio.h> #include <linux/ioctl.h> #include <linux/jhash.h> +#include <linux/refcount.h> #include <linux/trace_events.h> #include <linux/tracefs.h> #include <linux/types.h> @@ -39,28 +40,69 @@ */ #define MAX_PAGE_ORDER 0 #define MAX_PAGES (1 << MAX_PAGE_ORDER) -#define MAX_EVENTS (MAX_PAGES * PAGE_SIZE) +#define MAX_BYTES (MAX_PAGES * PAGE_SIZE) +#define MAX_EVENTS (MAX_BYTES * 8) /* Limit how long of an event name plus args within the subsystem. */ #define MAX_EVENT_DESC 512 #define EVENT_NAME(user_event) ((user_event)->tracepoint.name) #define MAX_FIELD_ARRAY_SIZE 1024 -#define MAX_FIELD_ARG_NAME 256 -static char *register_page_data; +/* + * The MAP_STATUS_* macros are used for taking a index and determining the + * appropriate byte and the bit in the byte to set/reset for an event. + * + * The lower 3 bits of the index decide which bit to set. + * The remaining upper bits of the index decide which byte to use for the bit. + * + * This is used when an event has a probe attached/removed to reflect live + * status of the event wanting tracing or not to user-programs via shared + * memory maps. + */ +#define MAP_STATUS_BYTE(index) ((index) >> 3) +#define MAP_STATUS_MASK(index) BIT((index) & 7) + +/* + * Internal bits (kernel side only) to keep track of connected probes: + * These are used when status is requested in text form about an event. These + * bits are compared against an internal byte on the event to determine which + * probes to print out to the user. + * + * These do not reflect the mapped bytes between the user and kernel space. + */ +#define EVENT_STATUS_FTRACE BIT(0) +#define EVENT_STATUS_PERF BIT(1) +#define EVENT_STATUS_OTHER BIT(7) + +/* + * Stores the pages, tables, and locks for a group of events. + * Each logical grouping of events has its own group, with a + * matching page for status checks within user programs. This + * allows for isolation of events to user programs by various + * means. + */ +struct user_event_group { + struct page *pages; + char *register_page_data; + char *system_name; + struct hlist_node node; + struct mutex reg_mutex; + DECLARE_HASHTABLE(register_table, 8); + DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +}; -static DEFINE_MUTEX(reg_mutex); -static DEFINE_HASHTABLE(register_table, 4); -static DECLARE_BITMAP(page_bitmap, MAX_EVENTS); +/* Group for init_user_ns mapping, top-most group */ +static struct user_event_group *init_group; /* * Stores per-event properties, as users register events * within a file a user_event might be created if it does not * already exist. These are globally used and their lifetime * is tied to the refcnt member. These cannot go away until the - * refcnt reaches zero. + * refcnt reaches one. 
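
/*
 * [Editorial sketch, not part of the patch] The MAP_STATUS_BYTE()/MAP_STATUS_MASK()
 * macros introduced above map an event index onto a (byte, bit) pair inside
 * the page that user programs mmap() for status checks: the low three bits of
 * the index select the bit, the remaining bits select the byte, so each
 * registered event costs one status bit rather than one status byte. The same
 * mapping spelled out as plain helpers:
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static inline size_t status_byte(unsigned int index)
{
	return index >> 3;			/* which byte holds this event's bit */
}

static inline uint8_t status_mask(unsigned int index)
{
	return (uint8_t)(1u << (index & 7));	/* which bit within that byte */
}

static inline void status_set(uint8_t *page, unsigned int index, bool on)
{
	if (on)
		page[status_byte(index)] |= status_mask(index);
	else
		page[status_byte(index)] &= ~status_mask(index);
}

static inline bool status_test(const uint8_t *page, unsigned int index)
{
	return page[status_byte(index)] & status_mask(index);
}
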
*/ struct user_event { + struct user_event_group *group; struct tracepoint tracepoint; struct trace_event_call call; struct trace_event_class class; @@ -68,10 +110,11 @@ struct user_event { struct hlist_node node; struct list_head fields; struct list_head validators; - atomic_t refcnt; + refcount_t refcnt; int index; int flags; int min_size; + char status; }; /* @@ -86,6 +129,11 @@ struct user_event_refs { struct user_event *events[]; }; +struct user_event_file_info { + struct user_event_group *group; + struct user_event_refs *refs; +}; + #define VALIDATOR_ENSURE_NULL (1 << 0) #define VALIDATOR_REL (1 << 1) @@ -98,7 +146,8 @@ struct user_event_validator { typedef void (*user_event_func_t) (struct user_event *user, struct iov_iter *i, void *tpdata, bool *faulted); -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser); static u32 user_event_key(char *name) @@ -106,6 +155,144 @@ static u32 user_event_key(char *name) return jhash(name, strlen(name), 0); } +static void set_page_reservations(char *pages, bool set) +{ + int page; + + for (page = 0; page < MAX_PAGES; ++page) { + void *addr = pages + (PAGE_SIZE * page); + + if (set) + SetPageReserved(virt_to_page(addr)); + else + ClearPageReserved(virt_to_page(addr)); + } +} + +static void user_event_group_destroy(struct user_event_group *group) +{ + if (group->register_page_data) + set_page_reservations(group->register_page_data, false); + + if (group->pages) + __free_pages(group->pages, MAX_PAGE_ORDER); + + kfree(group->system_name); + kfree(group); +} + +static char *user_event_group_system_name(struct user_namespace *user_ns) +{ + char *system_name; + int len = sizeof(USER_EVENTS_SYSTEM) + 1; + + if (user_ns != &init_user_ns) { + /* + * Unexpected at this point: + * We only currently support init_user_ns. + * When we enable more, this will trigger a failure so log. 
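
/*
 * [Editorial sketch, not part of the patch] Just below, current_user_event_group()
 * resolves the group for the calling task by walking up the user-namespace
 * parent chain until a namespace that owns a group is found (currently only
 * init_user_ns does). The lookup pattern in isolation, with hypothetical
 * types:
 */
#include <stddef.h>

struct group;

struct ns {
	struct ns *parent;	/* NULL at the top of the hierarchy */
	struct group *group;	/* NULL if this namespace owns no group */
};

/* Return the nearest group, searching from @ns up through its ancestors. */
static struct group *nearest_group(struct ns *ns)
{
	while (ns) {
		if (ns->group)
			return ns->group;
		ns = ns->parent;
	}

	return NULL;
}
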
+ */ + pr_warn("user_events: Namespace other than init_user_ns!\n"); + return NULL; + } + + system_name = kmalloc(len, GFP_KERNEL); + + if (!system_name) + return NULL; + + snprintf(system_name, len, "%s", USER_EVENTS_SYSTEM); + + return system_name; +} + +static inline struct user_event_group +*user_event_group_from_user_ns(struct user_namespace *user_ns) +{ + if (user_ns == &init_user_ns) + return init_group; + + return NULL; +} + +static struct user_event_group *current_user_event_group(void) +{ + struct user_namespace *user_ns = current_user_ns(); + struct user_event_group *group = NULL; + + while (user_ns) { + group = user_event_group_from_user_ns(user_ns); + + if (group) + break; + + user_ns = user_ns->parent; + } + + return group; +} + +static struct user_event_group +*user_event_group_create(struct user_namespace *user_ns) +{ + struct user_event_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + + if (!group) + return NULL; + + group->system_name = user_event_group_system_name(user_ns); + + if (!group->system_name) + goto error; + + group->pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); + + if (!group->pages) + goto error; + + group->register_page_data = page_address(group->pages); + + set_page_reservations(group->register_page_data, true); + + /* Zero all bits beside 0 (which is reserved for failures) */ + bitmap_zero(group->page_bitmap, MAX_EVENTS); + set_bit(0, group->page_bitmap); + + mutex_init(&group->reg_mutex); + hash_init(group->register_table); + + return group; +error: + if (group) + user_event_group_destroy(group); + + return NULL; +}; + +static __always_inline +void user_event_register_set(struct user_event *user) +{ + int i = user->index; + + user->group->register_page_data[MAP_STATUS_BYTE(i)] |= MAP_STATUS_MASK(i); +} + +static __always_inline +void user_event_register_clear(struct user_event *user) +{ + int i = user->index; + + user->group->register_page_data[MAP_STATUS_BYTE(i)] &= ~MAP_STATUS_MASK(i); +} + +static __always_inline __must_check +bool user_event_last_ref(struct user_event *user) +{ + return refcount_read(&user->refcnt) == 1; +} + static __always_inline __must_check size_t copy_nofault(void *addr, size_t bytes, struct iov_iter *i) { @@ -141,7 +328,8 @@ static struct list_head *user_event_get_fields(struct trace_event_call *call) * * Upon success user_event has its ref count increased by 1. 
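
/*
 * [Editorial sketch, not part of the patch] The switch from atomic_t to
 * refcount_t above goes together with the convention that the register table
 * keeps one reference of its own to each user_event, so "no outside users" is
 * detected as a count of one (user_event_last_ref()) rather than zero. A
 * compact user-space analogue of that ownership rule using C11 atomics:
 */
#include <stdatomic.h>
#include <stdbool.h>

struct obj {
	atomic_int refcnt;	/* initialised to 1: the registry's own reference */
};

static void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);
}

static void obj_put(struct obj *o)
{
	atomic_fetch_sub(&o->refcnt, 1);
}

/* Removable only when the registry's reference is the last one left. */
static bool obj_last_ref(struct obj *o)
{
	return atomic_load(&o->refcnt) == 1;
}
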
*/ -static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) +static int user_event_parse_cmd(struct user_event_group *group, + char *raw_command, struct user_event **newuser) { char *name = raw_command; char *args = strpbrk(name, " "); @@ -155,7 +343,7 @@ static int user_event_parse_cmd(char *raw_command, struct user_event **newuser) if (flags) *flags++ = '\0'; - return user_event_parse(name, args, flags, newuser); + return user_event_parse(group, name, args, flags, newuser); } static int user_field_array_size(const char *type) @@ -277,7 +465,7 @@ static int user_event_add_field(struct user_event *user, const char *type, goto add_field; add_validator: - if (strstr(type, "char") != 0) + if (strstr(type, "char") != NULL) validator_flags |= VALIDATOR_ENSURE_NULL; validator = kmalloc(sizeof(*validator), GFP_KERNEL); @@ -458,7 +646,7 @@ static const char *user_field_format(const char *type) return "%d"; if (strcmp(type, "unsigned char") == 0) return "%u"; - if (strstr(type, "char[") != 0) + if (strstr(type, "char[") != NULL) return "%s"; /* Unknown, likely struct, allowed treat as 64-bit */ @@ -479,10 +667,52 @@ static bool user_field_is_dyn_string(const char *type, const char **str_func) return false; check: - return strstr(type, "char") != 0; + return strstr(type, "char") != NULL; } #define LEN_OR_ZERO (len ? len - pos : 0) +static int user_dyn_field_set_string(int argc, const char **argv, int *iout, + char *buf, int len, bool *colon) +{ + int pos = 0, i = *iout; + + *colon = false; + + for (; i < argc; ++i) { + if (i != *iout) + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", argv[i]); + + if (strchr(argv[i], ';')) { + ++i; + *colon = true; + break; + } + } + + /* Actual set, advance i */ + if (len != 0) + *iout = i; + + return pos + 1; +} + +static int user_field_set_string(struct ftrace_event_field *field, + char *buf, int len, bool colon) +{ + int pos = 0; + + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->type); + pos += snprintf(buf + pos, LEN_OR_ZERO, " "); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s", field->name); + + if (colon) + pos += snprintf(buf + pos, LEN_OR_ZERO, ";"); + + return pos + 1; +} + static int user_event_set_print_fmt(struct user_event *user, char *buf, int len) { struct ftrace_event_field *field, *next; @@ -600,8 +830,8 @@ static int destroy_user_event(struct user_event *user) dyn_event_remove(&user->devent); - register_page_data[user->index] = 0; - clear_bit(user->index, page_bitmap); + user_event_register_clear(user); + clear_bit(user->index, user->group->page_bitmap); hash_del(&user->node); user_event_destroy_validators(user); @@ -612,16 +842,17 @@ static int destroy_user_event(struct user_event *user) return ret; } -static struct user_event *find_user_event(char *name, u32 *outkey) +static struct user_event *find_user_event(struct user_event_group *group, + char *name, u32 *outkey) { struct user_event *user; u32 key = user_event_key(name); *outkey = key; - hash_for_each_possible(register_table, user, node, key) + hash_for_each_possible(group->register_table, user, node, key) if (!strcmp(EVENT_NAME(user), name)) { - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); return user; } @@ -779,7 +1010,12 @@ static void update_reg_page_for(struct user_event *user) rcu_read_unlock_sched(); } - register_page_data[user->index] = status; + if (status) + user_event_register_set(user); + else + user_event_register_clear(user); + + user->status = status; } /* @@ -835,17 +1071,18 @@ static 
int user_event_reg(struct trace_event_call *call, return ret; inc: - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); update_reg_page_for(user); return 0; dec: update_reg_page_for(user); - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); return 0; } static int user_event_create(const char *raw_command) { + struct user_event_group *group; struct user_event *user; char *name; int ret; @@ -861,14 +1098,19 @@ static int user_event_create(const char *raw_command) if (!name) return -ENOMEM; - mutex_lock(&reg_mutex); + group = current_user_event_group(); - ret = user_event_parse_cmd(name, &user); + if (!group) + return -ENOENT; + + mutex_lock(&group->reg_mutex); + + ret = user_event_parse_cmd(group, name, &user); if (!ret) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); - mutex_unlock(&reg_mutex); + mutex_unlock(&group->reg_mutex); if (ret) kfree(name); @@ -910,14 +1152,14 @@ static bool user_event_is_busy(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - return atomic_read(&user->refcnt) != 0; + return !user_event_last_ref(user); } static int user_event_free(struct dyn_event *ev) { struct user_event *user = container_of(ev, struct user_event, devent); - if (atomic_read(&user->refcnt) != 0) + if (!user_event_last_ref(user)) return -EBUSY; return destroy_user_event(user); @@ -926,49 +1168,35 @@ static int user_event_free(struct dyn_event *ev) static bool user_field_match(struct ftrace_event_field *field, int argc, const char **argv, int *iout) { - char *field_name, *arg_name; - int len, pos, i = *iout; + char *field_name = NULL, *dyn_field_name = NULL; bool colon = false, match = false; + int dyn_len, len; - if (i >= argc) + if (*iout >= argc) return false; - len = MAX_FIELD_ARG_NAME; - field_name = kmalloc(len, GFP_KERNEL); - arg_name = kmalloc(len, GFP_KERNEL); - - if (!arg_name || !field_name) - goto out; - - pos = 0; - - for (; i < argc; ++i) { - if (i != *iout) - pos += snprintf(arg_name + pos, len - pos, " "); + dyn_len = user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + 0, &colon); - pos += snprintf(arg_name + pos, len - pos, argv[i]); + len = user_field_set_string(field, field_name, 0, colon); - if (strchr(argv[i], ';')) { - ++i; - colon = true; - break; - } - } + if (dyn_len != len) + return false; - pos = 0; + dyn_field_name = kmalloc(dyn_len, GFP_KERNEL); + field_name = kmalloc(len, GFP_KERNEL); - pos += snprintf(field_name + pos, len - pos, field->type); - pos += snprintf(field_name + pos, len - pos, " "); - pos += snprintf(field_name + pos, len - pos, field->name); + if (!dyn_field_name || !field_name) + goto out; - if (colon) - pos += snprintf(field_name + pos, len - pos, ";"); + user_dyn_field_set_string(argc, argv, iout, dyn_field_name, + dyn_len, &colon); - *iout = i; + user_field_set_string(field, field_name, len, colon); - match = strcmp(arg_name, field_name) == 0; + match = strcmp(dyn_field_name, field_name) == 0; out: - kfree(arg_name); + kfree(dyn_field_name); kfree(field_name); return match; @@ -1036,7 +1264,8 @@ static int user_event_trace_register(struct user_event *user) * The name buffer lifetime is owned by this method for success cases only. * Upon success the returned user_event has its ref count increased by 1.
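The hunks above replace a bare atomic_t with refcount_t; a minimal sketch of that lifetime pattern follows (the demo_* names are illustrative), the point being that refcount_t saturates and warns on over/underflow instead of silently wrapping.

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_event {
	refcount_t refcnt;
};

static struct demo_event *demo_event_create(void)
{
	struct demo_event *ev = kzalloc(sizeof(*ev), GFP_KERNEL);

	if (!ev)
		return NULL;

	/* One reference held by the registry itself, one for the caller. */
	refcount_set(&ev->refcnt, 2);
	return ev;
}

/* "Last ref" here means only the registry's own reference remains. */
static bool demo_event_last_ref(struct demo_event *ev)
{
	return refcount_read(&ev->refcnt) == 1;
}

static void demo_event_get(struct demo_event *ev)
{
	refcount_inc(&ev->refcnt);
}

static void demo_event_put(struct demo_event *ev)
{
	if (refcount_dec_and_test(&ev->refcnt))
		kfree(ev);
}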
*/ -static int user_event_parse(char *name, char *args, char *flags, +static int user_event_parse(struct user_event_group *group, char *name, + char *args, char *flags, struct user_event **newuser) { int ret; @@ -1046,7 +1275,7 @@ static int user_event_parse(char *name, char *args, char *flags, /* Prevent dyn_event from racing */ mutex_lock(&event_mutex); - user = find_user_event(name, &key); + user = find_user_event(group, name, &key); mutex_unlock(&event_mutex); if (user) { @@ -1059,7 +1288,7 @@ static int user_event_parse(char *name, char *args, char *flags, return 0; } - index = find_first_zero_bit(page_bitmap, MAX_EVENTS); + index = find_first_zero_bit(group->page_bitmap, MAX_EVENTS); if (index == MAX_EVENTS) return -EMFILE; @@ -1073,6 +1302,7 @@ static int user_event_parse(char *name, char *args, char *flags, INIT_LIST_HEAD(&user->fields); INIT_LIST_HEAD(&user->validators); + user->group = group; user->tracepoint.name = name; ret = user_event_parse_fields(user, args); @@ -1091,8 +1321,8 @@ static int user_event_parse(char *name, char *args, char *flags, user->call.flags = TRACE_EVENT_FL_TRACEPOINT; user->call.tp = &user->tracepoint; user->call.event.funcs = &user_event_funcs; + user->class.system = group->system_name; - user->class.system = USER_EVENTS_SYSTEM; user->class.fields_array = user_event_fields_array; user->class.get_fields = user_event_get_fields; user->class.reg = user_event_reg; @@ -1110,13 +1340,13 @@ static int user_event_parse(char *name, char *args, char *flags, user->index = index; - /* Ensure we track ref */ - atomic_inc(&user->refcnt); + /* Ensure we track self ref and caller ref (2) */ + refcount_set(&user->refcnt, 2); dyn_event_init(&user->devent, &user_event_dops); dyn_event_add(&user->devent, &user->call); - set_bit(user->index, page_bitmap); - hash_add(register_table, &user->node, key); + set_bit(user->index, group->page_bitmap); + hash_add(group->register_table, &user->node, key); mutex_unlock(&event_mutex); @@ -1134,32 +1364,20 @@ put_user: /* * Deletes a previously created event if it is no longer being used. 
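A small sketch of the index handout done with group->page_bitmap above: bit 0 stays set so index 0 can double as a failure value, and callers are assumed to serialize these helpers with a mutex the way reg_mutex does. DEMO_MAX and the demo_* names are illustrative.

#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/errno.h>

#define DEMO_MAX 4096

static DECLARE_BITMAP(demo_bitmap, DEMO_MAX);

static void demo_index_init(void)
{
	bitmap_zero(demo_bitmap, DEMO_MAX);
	set_bit(0, demo_bitmap);	/* reserve index 0 as the "failure" slot */
}

static int demo_index_alloc(void)
{
	int index = find_first_zero_bit(demo_bitmap, DEMO_MAX);

	if (index == DEMO_MAX)
		return -EMFILE;		/* table full */

	set_bit(index, demo_bitmap);
	return index;
}

static void demo_index_free(int index)
{
	clear_bit(index, demo_bitmap);
}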
*/ -static int delete_user_event(char *name) +static int delete_user_event(struct user_event_group *group, char *name) { u32 key; - int ret; - struct user_event *user = find_user_event(name, &key); + struct user_event *user = find_user_event(group, name, &key); if (!user) return -ENOENT; - /* Ensure we are the last ref */ - if (atomic_read(&user->refcnt) != 1) { - ret = -EBUSY; - goto put_ref; - } - - ret = destroy_user_event(user); - - if (ret) - goto put_ref; + refcount_dec(&user->refcnt); - return ret; -put_ref: - /* No longer have this ref */ - atomic_dec(&user->refcnt); + if (!user_event_last_ref(user)) + return -EBUSY; - return ret; + return destroy_user_event(user); } /* @@ -1167,6 +1385,7 @@ put_ref: */ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) { + struct user_event_file_info *info = file->private_data; struct user_event_refs *refs; struct user_event *user = NULL; struct tracepoint *tp; @@ -1178,7 +1397,7 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) rcu_read_lock_sched(); - refs = rcu_dereference_sched(file->private_data); + refs = rcu_dereference_sched(info->refs); /* * The refs->events array is protected by RCU, and new items may be @@ -1236,6 +1455,28 @@ static ssize_t user_events_write_core(struct file *file, struct iov_iter *i) return ret; } +static int user_events_open(struct inode *node, struct file *file) +{ + struct user_event_group *group; + struct user_event_file_info *info; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return -ENOMEM; + + info->group = group; + + file->private_data = info; + + return 0; +} + static ssize_t user_events_write(struct file *file, const char __user *ubuf, size_t count, loff_t *ppos) { @@ -1245,7 +1486,8 @@ static ssize_t user_events_write(struct file *file, const char __user *ubuf, if (unlikely(*ppos != 0)) return -EFAULT; - if (unlikely(import_single_range(READ, (char *)ubuf, count, &iov, &i))) + if (unlikely(import_single_range(WRITE, (char __user *)ubuf, + count, &iov, &i))) return -EFAULT; return user_events_write_core(file, &i); @@ -1256,13 +1498,15 @@ static ssize_t user_events_write_iter(struct kiocb *kp, struct iov_iter *i) { return user_events_write_core(kp->ki_filp, i); } -static int user_events_ref_add(struct file *file, struct user_event *user) +static int user_events_ref_add(struct user_event_file_info *info, + struct user_event *user) { + struct user_event_group *group = info->group; struct user_event_refs *refs, *new_refs; int i, size, count = 0; - refs = rcu_dereference_protected(file->private_data, - lockdep_is_held(&reg_mutex)); + refs = rcu_dereference_protected(info->refs, + lockdep_is_held(&group->reg_mutex)); if (refs) { count = refs->count; @@ -1286,9 +1530,9 @@ static int user_events_ref_add(struct file *file, struct user_event *user) new_refs->events[i] = user; - atomic_inc(&user->refcnt); + refcount_inc(&user->refcnt); - rcu_assign_pointer(file->private_data, new_refs); + rcu_assign_pointer(info->refs, new_refs); if (refs) kfree_rcu(refs, rcu); @@ -1309,13 +1553,24 @@ static long user_reg_get(struct user_reg __user *ureg, struct user_reg *kreg) if (size > PAGE_SIZE) return -E2BIG; - return copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + if (size < offsetofend(struct user_reg, write_index)) + return -EINVAL; + + ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size); + + if (ret) + return ret; + + kreg->size = size; + + return 0; } /* * Registers
a user_event on behalf of a user process. */ -static long user_events_ioctl_reg(struct file *file, unsigned long uarg) +static long user_events_ioctl_reg(struct user_event_file_info *info, + unsigned long uarg) { struct user_reg __user *ureg = (struct user_reg __user *)uarg; struct user_reg reg; @@ -1336,24 +1591,24 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) return ret; } - ret = user_event_parse_cmd(name, &user); + ret = user_event_parse_cmd(info->group, name, &user); if (ret) { kfree(name); return ret; } - ret = user_events_ref_add(file, user); + ret = user_events_ref_add(info, user); /* No longer need parse ref, ref_add either worked or not */ - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); /* Positive number is index and valid */ if (ret < 0) return ret; put_user((u32)ret, &ureg->write_index); - put_user(user->index, &ureg->status_index); + put_user(user->index, &ureg->status_bit); return 0; } @@ -1361,7 +1616,8 @@ static long user_events_ioctl_reg(struct file *file, unsigned long uarg) /* * Deletes a user_event on behalf of a user process. */ -static long user_events_ioctl_del(struct file *file, unsigned long uarg) +static long user_events_ioctl_del(struct user_event_file_info *info, + unsigned long uarg) { void __user *ubuf = (void __user *)uarg; char *name; @@ -1374,7 +1630,7 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) /* event_mutex prevents dyn_event from racing */ mutex_lock(&event_mutex); - ret = delete_user_event(name); + ret = delete_user_event(info->group, name); mutex_unlock(&event_mutex); kfree(name); @@ -1388,19 +1644,21 @@ static long user_events_ioctl_del(struct file *file, unsigned long uarg) static long user_events_ioctl(struct file *file, unsigned int cmd, unsigned long uarg) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group = info->group; long ret = -ENOTTY; switch (cmd) { case DIAG_IOCSREG: - mutex_lock(&reg_mutex); - ret = user_events_ioctl_reg(file, uarg); - mutex_unlock(&reg_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_reg(info, uarg); + mutex_unlock(&group->reg_mutex); break; case DIAG_IOCSDEL: - mutex_lock(&reg_mutex); - ret = user_events_ioctl_del(file, uarg); - mutex_unlock(&reg_mutex); + mutex_lock(&group->reg_mutex); + ret = user_events_ioctl_del(info, uarg); + mutex_unlock(&group->reg_mutex); break; } @@ -1412,17 +1670,24 @@ static long user_events_ioctl(struct file *file, unsigned int cmd, */ static int user_events_release(struct inode *node, struct file *file) { + struct user_event_file_info *info = file->private_data; + struct user_event_group *group; struct user_event_refs *refs; struct user_event *user; int i; + if (!info) + return -EINVAL; + + group = info->group; + /* * Ensure refs cannot change under any situation by taking the * register mutex during the final freeing of the references.
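The user_reg_get() hunk above follows the usual recipe for size-versioned ioctl arguments: reject anything smaller than the mandatory fields, let copy_struct_from_user() handle structs larger or smaller than the kernel's, and remember the size userspace passed. A hedged sketch with an illustrative struct (demo_reg is not the real user_reg layout):

#include <linux/mm.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/uaccess.h>

struct demo_reg {
	__u32 size;		/* userspace sets this to sizeof() its struct */
	__u64 name_args;
	__u32 extra;		/* a field a newer kernel might add */
};

static long demo_reg_get(struct demo_reg __user *ureg, struct demo_reg *kreg)
{
	u32 size;
	long ret;

	if (get_user(size, &ureg->size))
		return -EFAULT;

	if (size > PAGE_SIZE)
		return -E2BIG;

	/* Older callers must still provide at least the mandatory fields. */
	if (size < offsetofend(struct demo_reg, name_args))
		return -EINVAL;

	/* Zero-fills kreg and rejects non-zero bytes past sizeof(*kreg). */
	ret = copy_struct_from_user(kreg, sizeof(*kreg), ureg, size);
	if (ret)
		return ret;

	kreg->size = size;
	return 0;
}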
*/ - mutex_lock(&reg_mutex); + mutex_lock(&group->reg_mutex); - refs = file->private_data; + refs = info->refs; if (!refs) goto out; @@ -1436,37 +1701,56 @@ static int user_events_release(struct inode *node, struct file *file) user = refs->events[i]; if (user) - atomic_dec(&user->refcnt); + refcount_dec(&user->refcnt); } out: file->private_data = NULL; - mutex_unlock(&reg_mutex); + mutex_unlock(&group->reg_mutex); kfree(refs); + kfree(info); return 0; } static const struct file_operations user_data_fops = { + .open = user_events_open, .write = user_events_write, .write_iter = user_events_write_iter, .unlocked_ioctl = user_events_ioctl, .release = user_events_release, }; +static struct user_event_group *user_status_group(struct file *file) +{ + struct seq_file *m = file->private_data; + + if (!m) + return NULL; + + return m->private; +} + /* * Maps the shared page into the user process for checking if event is enabled. */ static int user_status_mmap(struct file *file, struct vm_area_struct *vma) { + char *pages; + struct user_event_group *group = user_status_group(file); unsigned long size = vma->vm_end - vma->vm_start; - if (size != MAX_EVENTS) + if (size != MAX_BYTES) return -EINVAL; + + if (!group) return -EINVAL; + pages = group->register_page_data; + return remap_pfn_range(vma, vma->vm_start, - virt_to_phys(register_page_data) >> PAGE_SHIFT, + virt_to_phys(pages) >> PAGE_SHIFT, size, vm_get_page_prot(VM_READ)); } @@ -1490,14 +1774,18 @@ static void user_seq_stop(struct seq_file *m, void *p) static int user_seq_show(struct seq_file *m, void *p) { + struct user_event_group *group = m->private; struct user_event *user; char status; int i, active = 0, busy = 0, flags; - mutex_lock(&reg_mutex); + if (!group) + return -EINVAL; + + mutex_lock(&group->reg_mutex); - hash_for_each(register_table, i, user, node) { - status = register_page_data[user->index]; + hash_for_each(group->register_table, i, user, node) { + status = user->status; flags = user->flags; seq_printf(m, "%d:%s", user->index, EVENT_NAME(user)); @@ -1520,7 +1808,7 @@ static int user_seq_show(struct seq_file *m, void *p) active++; } - mutex_unlock(&reg_mutex); + mutex_unlock(&group->reg_mutex); seq_puts(m, "\n"); seq_printf(m, "Active: %d\n", active); @@ -1539,7 +1827,24 @@ static const struct seq_operations user_seq_ops = { static int user_status_open(struct inode *node, struct file *file) { - return seq_open(file, &user_seq_ops); + struct user_event_group *group; + int ret; + + group = current_user_event_group(); + + if (!group) + return -ENOENT; + + ret = seq_open(file, &user_seq_ops); + + if (!ret) { + /* Chain group to seq_file */ + struct seq_file *m = file->private_data; + + m->private = group; + } + + return ret; } static const struct file_operations user_status_fops = { @@ -1580,42 +1885,21 @@ err: return -ENODEV; } -static void set_page_reservations(bool set) -{ - int page; - - for (page = 0; page < MAX_PAGES; ++page) { - void *addr = register_page_data + (PAGE_SIZE * page); - - if (set) - SetPageReserved(virt_to_page(addr)); - else - ClearPageReserved(virt_to_page(addr)); - } -} - static int __init trace_events_user_init(void) { - struct page *pages; int ret; - /* Zero all bits beside 0 (which is reserved for failures) */ - bitmap_zero(page_bitmap, MAX_EVENTS); - set_bit(0, page_bitmap); + init_group = user_event_group_create(&init_user_ns); - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, MAX_PAGE_ORDER); - if (!pages) + if (!init_group) return -ENOMEM; - register_page_data = page_address(pages); - - set_page_reservations(true);
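From the user side, the status page mapped by user_status_mmap() above is typically consumed like this; the tracefs path, the single-page mapping size, and the bit layout (byte index/8, mask 1<<(index%8)) are assumptions here, and the status_bit value would normally come back from the register ioctl.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/tracing/user_events_status";
	int status_bit = 40;			/* illustrative value */
	long len = sysconf(_SC_PAGESIZE);	/* assumes a single status page */
	unsigned char *status;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	status = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (status == MAP_FAILED) {
		perror("mmap");
		close(fd);
		return 1;
	}

	/* The cheap check writers do before paying for a real write. */
	if (status[status_bit >> 3] & (1 << (status_bit & 7)))
		printf("event enabled\n");
	else
		printf("event disabled\n");

	munmap(status, len);
	close(fd);
	return 0;
}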
ret = create_user_tracefs(); if (ret) { pr_warn("user_events could not register with tracefs\n"); - set_page_reservations(false); - __free_pages(pages, MAX_PAGE_ORDER); + user_event_group_destroy(init_group); + init_group = NULL; return ret; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 23f7f0ec4f4c..5a75b039e586 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -20,6 +20,7 @@ #include "trace_kprobe_selftest.h" #include "trace_probe.h" #include "trace_probe_tmpl.h" +#include "trace_probe_kernel.h" #define KPROBE_EVENT_SYSTEM "kprobes" #define KRETPROBE_MAXACTIVE_MAX 4096 @@ -1223,29 +1224,14 @@ static const struct file_operations kprobe_profile_ops = { static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { - const void __user *uaddr = (__force const void __user *)addr; - - return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + return kern_fetch_store_strlen_user(addr); } /* Return the length of string -- including null terminal byte */ static nokprobe_inline int fetch_store_strlen(unsigned long addr) { - int ret, len = 0; - u8 c; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if (addr < TASK_SIZE) - return fetch_store_strlen_user(addr); -#endif - - do { - ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); - len++; - } while (c && ret == 0 && len < MAX_STRING_SIZE); - - return (ret < 0) ? ret : len; + return kern_fetch_store_strlen(addr); } /* @@ -1255,21 +1241,7 @@ fetch_store_strlen(unsigned long addr) static nokprobe_inline int fetch_store_string_user(unsigned long addr, void *dest, void *base) { - const void __user *uaddr = (__force const void __user *)addr; - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string_user(addr, dest, base); } /* @@ -1279,29 +1251,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) static nokprobe_inline int fetch_store_string(unsigned long addr, void *dest, void *base) { - int maxlen = get_loc_len(*(u32 *)dest); - void *__dest; - long ret; - -#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE - if ((unsigned long)addr < TASK_SIZE) - return fetch_store_string_user(addr, dest, base); -#endif - - if (unlikely(!maxlen)) - return -ENOMEM; - - __dest = get_loc_data(dest, base); - - /* - * Try to get string again, since the string can be changed while - * probing. 
- */ - ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); - - return ret; + return kern_fetch_store_string(addr, dest, base); } static nokprobe_inline int diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 313439920a8c..78d536d3ff3d 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -1786,8 +1786,9 @@ static int start_per_cpu_kthreads(void) for_each_cpu(cpu, current_mask) { retval = start_kthread(cpu); if (retval) { + cpus_read_unlock(); stop_per_cpu_kthreads(); - break; + return retval; } } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 3b3869ae8cfd..de38f1c03776 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -445,7 +445,8 @@ extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, C(SAME_PROBE, "There is already the exact same probe event"),\ C(NO_EVENT_INFO, "This requires both group and event name to attach"),\ C(BAD_ATTACH_EVENT, "Attached event does not exist"),\ - C(BAD_ATTACH_ARG, "Attached event does not have this field"), + C(BAD_ATTACH_ARG, "Attached event does not have this field"),\ + C(NO_EP_FILTER, "No filter rule after 'if'"), #undef C #define C(a, b) TP_ERR_##a diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h new file mode 100644 index 000000000000..77dbd9ff9782 --- /dev/null +++ b/kernel/trace/trace_probe_kernel.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __TRACE_PROBE_KERNEL_H_ +#define __TRACE_PROBE_KERNEL_H_ + +#define FAULT_STRING "(fault)" + +/* + * This depends on trace_probe.h, but can not include it due to + * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. + * Which means that any other user must include trace_probe.h before including + * this file. + */ +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen_user(unsigned long addr) +{ + const void __user *uaddr = (__force const void __user *)addr; + int ret; + + ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); + /* + * strnlen_user_nofault returns zero on fault, insert the + * FAULT_STRING when that occurs. + */ + if (ret <= 0) + return strlen(FAULT_STRING) + 1; + return ret; +} + +/* Return the length of string -- including null terminal byte */ +static nokprobe_inline int +kern_fetch_store_strlen(unsigned long addr) +{ + int ret, len = 0; + u8 c; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if (addr < TASK_SIZE) + return kern_fetch_store_strlen_user(addr); +#endif + + do { + ret = copy_from_kernel_nofault(&c, (u8 *)addr + len, 1); + len++; + } while (c && ret == 0 && len < MAX_STRING_SIZE); + + /* For faults, return enough to hold the FAULT_STRING */ + return (ret < 0) ? strlen(FAULT_STRING) + 1 : len; +} + +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) +{ + if (ret >= 0) { + *(u32 *)dest = make_data_loc(ret, __dest - base); + } else { + strscpy(__dest, FAULT_STRING, len); + ret = strlen(__dest) + 1; + } +} + +/* + * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf + * with max length and relative data location. 
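The make_data_loc()/get_loc_len() helpers used above follow the usual ftrace __data_loc convention, assumed here to pack the payload length in the upper 16 bits and the offset from the start of the entry in the lower 16; a tiny standalone illustration:

#include <stdint.h>
#include <stdio.h>

static uint32_t demo_make_data_loc(uint16_t len, uint16_t offs)
{
	return ((uint32_t)len << 16) | offs;
}

int main(void)
{
	/* 8 bytes of string data stored 44 bytes into the event record. */
	uint32_t dl = demo_make_data_loc(8, 44);

	printf("len=%u offs=%u\n", dl >> 16, dl & 0xffff);
	return 0;
}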
+ */ +static nokprobe_inline int +kern_fetch_store_string_user(unsigned long addr, void *dest, void *base) +{ + const void __user *uaddr = (__force const void __user *)addr; + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); + set_data_loc(ret, dest, __dest, base, maxlen); + + return ret; +} + +/* + * Fetch a null-terminated string. Caller MUST set *(u32 *)buf with max + * length and relative data location. + */ +static nokprobe_inline int +kern_fetch_store_string(unsigned long addr, void *dest, void *base) +{ + int maxlen = get_loc_len(*(u32 *)dest); + void *__dest; + long ret; + +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + if ((unsigned long)addr < TASK_SIZE) + return kern_fetch_store_string_user(addr, dest, base); +#endif + + if (unlikely(!maxlen)) + return -ENOMEM; + + __dest = get_loc_data(dest, base); + + /* + * Try to get string again, since the string can be changed while + * probing. + */ + ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); + set_data_loc(ret, dest, __dest, base, maxlen); + + return ret; +} + +#endif /* __TRACE_PROBE_KERNEL_H_ */ diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 9901708ce6b8..c774e560f2f9 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -961,7 +961,7 @@ create_sort_entry(void *key, struct tracing_map_elt *elt) static void detect_dups(struct tracing_map_sort_entry **sort_entries, int n_entries, unsigned int key_size) { - unsigned int dups = 0, total_dups = 0; + unsigned int total_dups = 0; int i; void *key; @@ -974,11 +974,10 @@ static void detect_dups(struct tracing_map_sort_entry **sort_entries, key = sort_entries[0]->key; for (i = 1; i < n_entries; i++) { if (!memcmp(sort_entries[i]->key, key, key_size)) { - dups++; total_dups++; + total_dups++; continue; } key = sort_entries[i]->key; - dups = 0; } WARN_ONCE(total_dups > 0, diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ef42c1a11920..f23144af5743 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -640,7 +640,6 @@ static void tp_module_going_check_quiescent(struct tracepoint *tp, void *priv) static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod; - int ret = 0; if (!mod->num_tracepoints) return 0; @@ -652,19 +651,18 @@ static int tracepoint_module_coming(struct module *mod) */ if (trace_module_has_bad_taint(mod)) return 0; - mutex_lock(&tracepoint_module_list_mutex); + tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); - if (!tp_mod) { - ret = -ENOMEM; - goto end; - } + if (!tp_mod) + return -ENOMEM; tp_mod->mod = mod; + + mutex_lock(&tracepoint_module_list_mutex); list_add_tail(&tp_mod->list, &tracepoint_module_list); blocking_notifier_call_chain(&tracepoint_notify_list, MODULE_STATE_COMING, tp_mod); -end: mutex_unlock(&tracepoint_module_list_mutex); - return ret; + return 0; } static void tracepoint_module_going(struct module *mod) diff --git a/kernel/ucount.c b/kernel/ucount.c index 06ea04d44685..ee8e57fd6f90 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -87,10 +87,6 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_marks"), #endif - { }, - { }, - { }, - { }, { } }; #endif /* CONFIG_SYSCTL */ @@ -263,29 +259,29 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } -long inc_rlimit_ucounts(struct 
ucounts *ucounts, enum ucount_type type, long v) +long inc_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long max = LONG_MAX; long ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long new = atomic_long_add_return(v, &iter->ucount[type]); + long new = atomic_long_add_return(v, &iter->rlimit[type]); if (new < 0 || new > max) ret = LONG_MAX; else if (iter == ucounts) ret = new; - max = READ_ONCE(iter->ns->ucount_max[type]); + max = get_userns_rlimit_max(iter->ns, type); } return ret; } -bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum rlimit_type type, long v) { struct ucounts *iter; long new = -1; /* Silence compiler warning */ for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long dec = atomic_long_sub_return(v, &iter->ucount[type]); + long dec = atomic_long_sub_return(v, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); if (iter == ucounts) new = dec; @@ -294,11 +290,11 @@ bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) } static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, - struct ucounts *last, enum ucount_type type) + struct ucounts *last, enum rlimit_type type) { struct ucounts *iter, *next; for (iter = ucounts; iter != last; iter = next) { - long dec = atomic_long_sub_return(1, &iter->ucount[type]); + long dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); next = iter->ns->ucounts; if (dec == 0) @@ -306,12 +302,12 @@ static void do_dec_rlimit_put_ucounts(struct ucounts *ucounts, } } -void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum ucount_type type) +void dec_rlimit_put_ucounts(struct ucounts *ucounts, enum rlimit_type type) { do_dec_rlimit_put_ucounts(ucounts, NULL, type); } -long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) +long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum rlimit_type type) { /* Caller must hold a reference to ucounts */ struct ucounts *iter; @@ -319,12 +315,12 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) long dec, ret = 0; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long new = atomic_long_add_return(1, &iter->ucount[type]); + long new = atomic_long_add_return(1, &iter->rlimit[type]); if (new < 0 || new > max) goto unwind; if (iter == ucounts) ret = new; - max = READ_ONCE(iter->ns->ucount_max[type]); + max = get_userns_rlimit_max(iter->ns, type); /* * Grab an extra ucount reference for the caller when * the rlimit count was previously 0. 
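The inc/dec walks above all follow the same shape: charge every level of the namespace chain, and if any level trips its limit, unwind exactly the levels already charged. A simplified, illustrative model (demo_ns is not the real ucounts layout):

#include <linux/atomic.h>
#include <linux/types.h>

struct demo_ns {
	struct demo_ns *parent;
	long max;
	atomic_long_t count;
};

static bool demo_charge_one(struct demo_ns *ns)
{
	struct demo_ns *iter, *fail = NULL;

	for (iter = ns; iter; iter = iter->parent) {
		if (atomic_long_add_return(1, &iter->count) > iter->max) {
			fail = iter;
			break;
		}
	}

	if (!fail)
		return true;

	/* Roll back everything charged so far, including the failing level. */
	for (iter = ns; iter; iter = iter->parent) {
		atomic_long_dec(&iter->count);
		if (iter == fail)
			break;
	}

	return false;
}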
@@ -336,24 +332,24 @@ long inc_rlimit_get_ucounts(struct ucounts *ucounts, enum ucount_type type) } return ret; dec_unwind: - dec = atomic_long_sub_return(1, &iter->ucount[type]); + dec = atomic_long_sub_return(1, &iter->rlimit[type]); WARN_ON_ONCE(dec < 0); unwind: do_dec_rlimit_put_ucounts(ucounts, iter, type); return 0; } -bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long rlimit) +bool is_rlimit_overlimit(struct ucounts *ucounts, enum rlimit_type type, unsigned long rlimit) { struct ucounts *iter; long max = rlimit; if (rlimit > LONG_MAX) max = LONG_MAX; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - long val = get_ucounts_value(iter, type); + long val = get_rlimit_value(iter, type); if (val < 0 || val > max) return true; - max = READ_ONCE(iter->ns->ucount_max[type]); + max = get_userns_rlimit_max(iter->ns, type); } return false; } diff --git a/kernel/umh.c b/kernel/umh.c index b989736e8707..850631518665 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -28,6 +28,7 @@ #include <linux/async.h> #include <linux/uaccess.h> #include <linux/initrd.h> +#include <linux/freezer.h> #include <trace/events/module.h> @@ -403,6 +404,7 @@ EXPORT_SYMBOL(call_usermodehelper_setup); */ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) { + unsigned int state = TASK_UNINTERRUPTIBLE; DECLARE_COMPLETION_ONSTACK(done); int retval = 0; @@ -436,18 +438,22 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) if (wait == UMH_NO_WAIT) /* task has freed sub_info */ goto unlock; - if (wait & UMH_KILLABLE) { - retval = wait_for_completion_killable(&done); - if (!retval) - goto wait_done; + if (wait & UMH_KILLABLE) + state |= TASK_KILLABLE; + + if (wait & UMH_FREEZABLE) + state |= TASK_FREEZABLE; + retval = wait_for_completion_state(&done, state); + if (!retval) + goto wait_done; + + if (wait & UMH_KILLABLE) { /* umh_complete() will see NULL and free sub_info */ if (xchg(&sub_info->complete, NULL)) goto unlock; - /* fallthrough, umh_complete() was already called */ } - wait_for_completion(&done); wait_done: retval = sub_info->retval; out: diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 3f464bbda0e9..54211dbd516c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -136,13 +136,13 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); - for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { + for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); - set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); + set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4ca61d49885b..064072c16e3d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -8,6 +8,7 @@ #include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> +#include <linux/random.h> #include 
<linux/sysctl.h> #include <linux/wait.h> #include <linux/rwsem.h> @@ -57,6 +58,7 @@ static int proc_do_uts_string(struct ctl_table *table, int write, * theoretically be incorrect if there are two parallel writes * at non-zero offsets to the same sysctl. */ + add_device_randomness(tmp_data, sizeof(tmp_data)); down_write(&uts_sem); memcpy(get_uts(table), tmp_data, sizeof(tmp_data)); up_write(&uts_sem); @@ -74,6 +76,13 @@ static DEFINE_CTL_TABLE_POLL(domainname_poll); static struct ctl_table uts_kern_table[] = { { + .procname = "arch", + .data = init_uts_ns.name.machine, + .maxlen = sizeof(init_uts_ns.name.machine), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, + { .procname = "ostype", .data = init_uts_ns.name.sysname, .maxlen = sizeof(init_uts_ns.name.sysname), |
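With the new "arch" entry above, the machine string becomes visible through sysctl; assuming it lands alongside the other UTS entries under /proc/sys/kernel/, reading it from userspace looks like:

#include <stdio.h>

int main(void)
{
	char arch[65] = "";
	FILE *f = fopen("/proc/sys/kernel/arch", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}

	if (fgets(arch, sizeof(arch), f))
		printf("arch: %s", arch);

	fclose(f);
	return 0;
}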