summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/cgroup.c513
-rw-r--r--kernel/bpf/core.c71
-rw-r--r--kernel/bpf/devmap.c6
-rw-r--r--kernel/bpf/lpm_trie.c95
-rw-r--r--kernel/bpf/sockmap.c4
-rw-r--r--kernel/bpf/syscall.c135
-rw-r--r--kernel/bpf/verifier.c184
-rw-r--r--kernel/cgroup/cgroup.c46
-rw-r--r--kernel/cpu.c506
-rw-r--r--kernel/events/core.c3
-rw-r--r--kernel/events/ring_buffer.c20
-rw-r--r--kernel/exit.c23
-rw-r--r--kernel/extable.c45
-rw-r--r--kernel/fork.c18
-rw-r--r--kernel/futex.c33
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/generic-chip.c1
-rw-r--r--kernel/irq/irqdomain.c4
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kcmp.c2
-rw-r--r--kernel/locking/rwsem-xadd.c27
-rw-r--r--kernel/memremap.c4
-rw-r--r--kernel/params.c35
-rw-r--r--kernel/power/suspend.c18
-rw-r--r--kernel/rcu/tree.c10
-rw-r--r--kernel/sched/core.c24
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/seccomp.c344
-rw-r--r--kernel/sysctl.c5
-rw-r--r--kernel/trace/blktrace.c18
-rw-r--r--kernel/trace/ftrace.c14
-rw-r--r--kernel/trace/trace.c19
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_mmiotrace.c1
-rw-r--r--kernel/trace/trace_output.c21
-rw-r--r--kernel/trace/trace_sched_wakeup.c8
-rw-r--r--kernel/trace/trace_stack.c15
37 files changed, 1791 insertions, 491 deletions
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..e88abc0865d5 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,407 @@ void cgroup_bpf_put(struct cgroup *cgrp)
{
unsigned int type;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
- struct bpf_prog *prog = cgrp->bpf.prog[type];
-
- if (prog) {
- bpf_prog_put(prog);
+ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog_list *pl, *tmp;
+
+ list_for_each_entry_safe(pl, tmp, progs, node) {
+ list_del(&pl->node);
+ bpf_prog_put(pl->prog);
+ kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key);
}
+ bpf_prog_array_free(cgrp->bpf.effective[type]);
+ }
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+ struct bpf_prog_list *pl;
+ u32 cnt = 0;
+
+ list_for_each_entry(pl, head, node) {
+ if (!pl->prog)
+ continue;
+ cnt++;
}
+ return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ u32 new_flags)
+{
+ struct cgroup *p;
+
+ p = cgroup_parent(cgrp);
+ if (!p)
+ return true;
+ do {
+ u32 flags = p->bpf.flags[type];
+ u32 cnt;
+
+ if (flags & BPF_F_ALLOW_MULTI)
+ return true;
+ cnt = prog_list_length(&p->bpf.progs[type]);
+ WARN_ON_ONCE(cnt > 1);
+ if (cnt == 1)
+ return !!(flags & BPF_F_ALLOW_OVERRIDE);
+ p = cgroup_parent(p);
+ } while (p);
+ return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu **array)
+{
+ struct bpf_prog_array __rcu *progs;
+ struct bpf_prog_list *pl;
+ struct cgroup *p = cgrp;
+ int cnt = 0;
+
+ /* count number of effective programs by walking parents */
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ cnt += prog_list_length(&p->bpf.progs[type]);
+ p = cgroup_parent(p);
+ } while (p);
+
+ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+ if (!progs)
+ return -ENOMEM;
+
+ /* populate the array with effective progs */
+ cnt = 0;
+ p = cgrp;
+ do {
+ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+ list_for_each_entry(pl,
+ &p->bpf.progs[type], node) {
+ if (!pl->prog)
+ continue;
+ rcu_dereference_protected(progs, 1)->
+ progs[cnt++] = pl->prog;
+ }
+ p = cgroup_parent(p);
+ } while (p);
+
+ *array = progs;
+ return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+ enum bpf_attach_type type,
+ struct bpf_prog_array __rcu *array)
+{
+ struct bpf_prog_array __rcu *old_array;
+
+ old_array = xchg(&cgrp->bpf.effective[type], array);
+ /* free prog array after grace period, since __cgroup_bpf_run_*()
+ * might be still walking the array
+ */
+ bpf_prog_array_free(old_array);
}
/**
* cgroup_bpf_inherit() - inherit effective programs from parent
* @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
*/
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
{
- unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define NR ARRAY_SIZE(cgrp->bpf.effective)
+ struct bpf_prog_array __rcu *arrays[NR] = {};
+ int i;
- for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
- struct bpf_prog *e;
+ for (i = 0; i < NR; i++)
+ INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
- e = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- rcu_assign_pointer(cgrp->bpf.effective[type], e);
- cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
- }
+ for (i = 0; i < NR; i++)
+ if (compute_effective_progs(cgrp, i, &arrays[i]))
+ goto cleanup;
+
+ for (i = 0; i < NR; i++)
+ activate_effective_progs(cgrp, i, arrays[i]);
+
+ return 0;
+cleanup:
+ for (i = 0; i < NR; i++)
+ bpf_prog_array_free(arrays[i]);
+ return -ENOMEM;
}
+#define BPF_CGROUP_MAX_PROGS 64
+
/**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
*
* Must be called with cgroup_mutex held.
*/
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
- struct bpf_prog *prog, enum bpf_attach_type type,
- bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
{
- struct bpf_prog *old_prog, *effective = NULL;
- struct cgroup_subsys_state *pos;
- bool overridable = true;
-
- if (parent) {
- overridable = !parent->bpf.disallow_override[type];
- effective = rcu_dereference_protected(parent->bpf.effective[type],
- lockdep_is_held(&cgroup_mutex));
- }
-
- if (prog && effective && !overridable)
- /* if parent has non-overridable prog attached, disallow
- * attaching new programs to descendent cgroup
- */
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ bool pl_was_allocated;
+ u32 old_flags;
+ int err;
+
+ if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+ /* invalid combination */
+ return -EINVAL;
+
+ if (!hierarchy_allows_attach(cgrp, type, flags))
return -EPERM;
- if (prog && effective && overridable != new_overridable)
- /* if parent has overridable prog attached, only
- * allow overridable programs in descendent cgroup
+ if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+ /* Disallow attaching non-overridable on top
+ * of existing overridable in this cgroup.
+ * Disallow attaching multi-prog if overridable or none
*/
return -EPERM;
- old_prog = cgrp->bpf.prog[type];
-
- if (prog) {
- overridable = new_overridable;
- effective = prog;
- if (old_prog &&
- cgrp->bpf.disallow_override[type] == new_overridable)
- /* disallow attaching non-overridable on top
- * of existing overridable in this cgroup
- * and vice versa
- */
- return -EPERM;
+ if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+ return -E2BIG;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ list_for_each_entry(pl, progs, node)
+ if (pl->prog == prog)
+ /* disallow attaching the same prog twice */
+ return -EINVAL;
+
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ pl->prog = prog;
+ list_add_tail(&pl->node, progs);
+ } else {
+ if (list_empty(progs)) {
+ pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return -ENOMEM;
+ pl_was_allocated = true;
+ list_add_tail(&pl->node, progs);
+ } else {
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl_was_allocated = false;
+ }
+ pl->prog = prog;
}
- if (!prog && !old_prog)
- /* report error when trying to detach and nothing is attached */
- return -ENOENT;
-
- cgrp->bpf.prog[type] = prog;
+ old_flags = cgrp->bpf.flags[type];
+ cgrp->bpf.flags[type] = flags;
- css_for_each_descendant_pre(pos, &cgrp->self) {
- struct cgroup *desc = container_of(pos, struct cgroup, self);
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
- /* skip the subtree if the descendant has its own program */
- if (desc->bpf.prog[type] && desc != cgrp) {
- pos = css_rightmost_descendant(pos);
- } else {
- rcu_assign_pointer(desc->bpf.effective[type],
- effective);
- desc->bpf.disallow_override[type] = !overridable;
- }
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
}
- if (prog)
- static_branch_inc(&cgroup_bpf_enabled_key);
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+ static_branch_inc(&cgroup_bpf_enabled_key);
if (old_prog) {
bpf_prog_put(old_prog);
static_branch_dec(&cgroup_bpf_enabled_key);
}
return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and cleanup the prog list */
+ pl->prog = old_prog;
+ if (pl_was_allocated) {
+ list_del(&pl->node);
+ kfree(pl);
+ }
+ return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ * propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 unused_flags)
+{
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ struct bpf_prog *old_prog = NULL;
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_list *pl;
+ int err;
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ if (!prog)
+ /* to detach MULTI prog the user has to specify valid FD
+ * of the program to be detached
+ */
+ return -EINVAL;
+ } else {
+ if (list_empty(progs))
+ /* report error when trying to detach and nothing is attached */
+ return -ENOENT;
+ }
+
+ if (flags & BPF_F_ALLOW_MULTI) {
+ /* find the prog and detach it */
+ list_for_each_entry(pl, progs, node) {
+ if (pl->prog != prog)
+ continue;
+ old_prog = prog;
+ /* mark it deleted, so it's ignored while
+ * recomputing effective
+ */
+ pl->prog = NULL;
+ break;
+ }
+ if (!old_prog)
+ return -ENOENT;
+ } else {
+ /* to maintain backward compatibility NONE and OVERRIDE cgroups
+ * allow detaching with invalid FD (prog==NULL)
+ */
+ pl = list_first_entry(progs, typeof(*pl), node);
+ old_prog = pl->prog;
+ pl->prog = NULL;
+ }
+
+ /* allocate and recompute effective prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+ if (err)
+ goto cleanup;
+ }
+
+ /* all allocations were successful. Activate all prog arrays */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ activate_effective_progs(desc, type, desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* now can actually delete it from this cgroup list */
+ list_del(&pl->node);
+ kfree(pl);
+ if (list_empty(progs))
+ /* last program was detached, reset flags to zero */
+ cgrp->bpf.flags[type] = 0;
+
+ bpf_prog_put(old_prog);
+ static_branch_dec(&cgroup_bpf_enabled_key);
+ return 0;
+
+cleanup:
+ /* oom while computing effective. Free all computed effective arrays
+ * since they were not activated
+ */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ bpf_prog_array_free(desc->bpf.inactive);
+ desc->bpf.inactive = NULL;
+ }
+
+ /* and restore back old_prog */
+ pl->prog = old_prog;
+ return err;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+ enum bpf_attach_type type = attr->query.attach_type;
+ struct list_head *progs = &cgrp->bpf.progs[type];
+ u32 flags = cgrp->bpf.flags[type];
+ int cnt, ret = 0, i;
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+ cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+ else
+ cnt = prog_list_length(progs);
+
+ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+ return -EFAULT;
+ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ return -EFAULT;
+ if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ /* return early if user requested only program count + flags */
+ return 0;
+ if (attr->query.prog_cnt < cnt) {
+ cnt = attr->query.prog_cnt;
+ ret = -ENOSPC;
+ }
+
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+ prog_ids, cnt);
+ } else {
+ struct bpf_prog_list *pl;
+ u32 id;
+
+ i = 0;
+ list_for_each_entry(pl, progs, node) {
+ id = pl->prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
+ }
+ return ret;
}
/**
@@ -171,36 +449,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
enum bpf_attach_type type)
{
- struct bpf_prog *prog;
+ unsigned int offset = skb->data - skb_network_header(skb);
+ struct sock *save_sk;
struct cgroup *cgrp;
- int ret = 0;
+ int ret;
if (!sk || !sk_fullsock(sk))
return 0;
- if (sk->sk_family != AF_INET &&
- sk->sk_family != AF_INET6)
+ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
return 0;
cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog) {
- unsigned int offset = skb->data - skb_network_header(skb);
- struct sock *save_sk = skb->sk;
-
- skb->sk = sk;
- __skb_push(skb, offset);
- ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
- __skb_pull(skb, offset);
- skb->sk = save_sk;
- }
-
- rcu_read_unlock();
-
- return ret;
+ save_sk = skb->sk;
+ skb->sk = sk;
+ __skb_push(skb, offset);
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+ bpf_prog_run_save_cb);
+ __skb_pull(skb, offset);
+ skb->sk = save_sk;
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
@@ -221,19 +489,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
-
-
- rcu_read_lock();
-
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
-
- rcu_read_unlock();
+ int ret;
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
@@ -258,18 +517,10 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
enum bpf_attach_type type)
{
struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
- struct bpf_prog *prog;
- int ret = 0;
-
-
- rcu_read_lock();
+ int ret;
- prog = rcu_dereference(cgrp->bpf.effective[type]);
- if (prog)
- ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
-
- rcu_read_unlock();
-
- return ret;
+ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+ BPF_PROG_RUN);
+ return ret == 1 ? 0 : -EPERM;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 917cc04a0a94..c6be15ae83ee 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1022,7 +1022,7 @@ select_insn:
struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_prog *prog;
- u64 index = BPF_R3;
+ u32 index = BPF_R3;
if (unlikely(index >= array->map.max_entries))
goto out;
@@ -1381,6 +1381,75 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+ struct bpf_prog_array hdr;
+ struct bpf_prog *null_prog;
+} empty_prog_array = {
+ .null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+ if (prog_cnt)
+ return kzalloc(sizeof(struct bpf_prog_array) +
+ sizeof(struct bpf_prog *) * (prog_cnt + 1),
+ flags);
+
+ return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+ if (!progs ||
+ progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+ return;
+ kfree_rcu(progs, rcu);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+ struct bpf_prog **prog;
+ u32 cnt = 0;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++)
+ cnt++;
+ rcu_read_unlock();
+ return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+ __u32 __user *prog_ids, u32 cnt)
+{
+ struct bpf_prog **prog;
+ u32 i = 0, id;
+
+ rcu_read_lock();
+ prog = rcu_dereference(progs)->progs;
+ for (; *prog; prog++) {
+ id = (*prog)->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+ rcu_read_unlock();
+ return -EFAULT;
+ }
+ if (++i == cnt) {
+ prog++;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ if (*prog)
+ return -ENOSPC;
+ return 0;
+}
+
static void bpf_prog_free_deferred(struct work_struct *work)
{
struct bpf_prog_aux *aux;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 959c9a07f318..e093d9a2c4dd 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -75,8 +75,8 @@ static u64 dev_map_bitmap_size(const union bpf_attr *attr)
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
struct bpf_dtab *dtab;
+ int err = -EINVAL;
u64 cost;
- int err;
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
@@ -108,6 +108,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
if (err)
goto free_dtab;
+ err = -ENOMEM;
+
/* A per cpu bitfield with a bit per possible net device */
dtab->flush_needed = __alloc_percpu(dev_map_bitmap_size(attr),
__alignof__(unsigned long));
@@ -128,7 +130,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
free_dtab:
free_percpu(dtab->flush_needed);
kfree(dtab);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}
static void dev_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..34d8a690ea05 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,99 @@ out:
return ret;
}
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
{
- /* TODO */
- return -ENOSYS;
+ struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+ struct bpf_lpm_trie_key *key = _key;
+ struct lpm_trie_node __rcu **trim, **trim2;
+ struct lpm_trie_node *node, *parent;
+ unsigned long irq_flags;
+ unsigned int next_bit;
+ size_t matchlen = 0;
+ int ret = 0;
+
+ if (key->prefixlen > trie->max_prefixlen)
+ return -EINVAL;
+
+ raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+ /* Walk the tree looking for an exact key/length match and keeping
+ * track of the path we traverse. We will need to know the node
+ * we wish to delete, and the slot that points to the node we want
+ * to delete. We may also need to know the nodes parent and the
+ * slot that contains it.
+ */
+ trim = &trie->root;
+ trim2 = trim;
+ parent = NULL;
+ while ((node = rcu_dereference_protected(
+ *trim, lockdep_is_held(&trie->lock)))) {
+ matchlen = longest_prefix_match(trie, node, key);
+
+ if (node->prefixlen != matchlen ||
+ node->prefixlen == key->prefixlen)
+ break;
+
+ parent = node;
+ trim2 = trim;
+ next_bit = extract_bit(key->data, node->prefixlen);
+ trim = &node->child[next_bit];
+ }
+
+ if (!node || node->prefixlen != key->prefixlen ||
+ (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ trie->n_entries--;
+
+ /* If the node we are removing has two children, simply mark it
+ * as intermediate and we are done.
+ */
+ if (rcu_access_pointer(node->child[0]) &&
+ rcu_access_pointer(node->child[1])) {
+ node->flags |= LPM_TREE_NODE_FLAG_IM;
+ goto out;
+ }
+
+ /* If the parent of the node we are about to delete is an intermediate
+ * node, and the deleted node doesn't have any children, we can delete
+ * the intermediate parent as well and promote its other child
+ * up the tree. Doing this maintains the invariant that all
+ * intermediate nodes have exactly 2 children and that there are no
+ * unnecessary intermediate nodes in the tree.
+ */
+ if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+ !node->child[0] && !node->child[1]) {
+ if (node == rcu_access_pointer(parent->child[0]))
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[1]));
+ else
+ rcu_assign_pointer(
+ *trim2, rcu_access_pointer(parent->child[0]));
+ kfree_rcu(parent, rcu);
+ kfree_rcu(node, rcu);
+ goto out;
+ }
+
+ /* The node we are removing has either zero or one child. If there
+ * is a child, move it into the removed node's slot then delete
+ * the node. Otherwise just clear the slot and delete the node.
+ */
+ if (node->child[0])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+ else if (node->child[1])
+ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+ else
+ RCU_INIT_POINTER(*trim, NULL);
+ kfree_rcu(node, rcu);
+
+out:
+ raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+ return ret;
}
#define LPM_DATA_SIZE_MAX 256
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 6424ce0e4969..a298d6666698 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -102,7 +102,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
skb_orphan(skb);
skb->sk = psock->sock;
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
@@ -369,7 +369,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
* any socket yet.
*/
skb->sk = psock->sock;
- bpf_compute_data_end(skb);
+ bpf_compute_data_pointers(skb);
rc = (*prog->bpf_func)(skb, prog->insnsi);
skb->sk = NULL;
rcu_read_unlock();
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index cb17e1cd1d43..0048cb24ba7b 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -186,15 +189,17 @@ static int bpf_map_alloc_id(struct bpf_map *map)
static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
{
+ unsigned long flags;
+
if (do_idr_lock)
- spin_lock_bh(&map_idr_lock);
+ spin_lock_irqsave(&map_idr_lock, flags);
else
__acquire(&map_idr_lock);
idr_remove(&map_idr, map->id);
if (do_idr_lock)
- spin_unlock_bh(&map_idr_lock);
+ spin_unlock_irqrestore(&map_idr_lock, flags);
else
__release(&map_idr_lock);
}
@@ -310,7 +315,31 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+ const char *end = src + BPF_OBJ_NAME_LEN;
+
+ /* Copy all isalnum() and '_' char */
+ while (src < end && *src) {
+ if (!isalnum(*src) && *src != '_')
+ return -EINVAL;
+ *dst++ = *src++;
+ }
+
+ /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+ if (src == end)
+ return -EINVAL;
+
+ /* '\0' terminates dst */
+ *dst = 0;
+
+ return 0;
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_name
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@@ -332,6 +361,10 @@ static int map_create(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
+ err = bpf_obj_name_cpy(map->name, attr->map_name);
+ if (err)
+ goto free_map_nouncharge;
+
atomic_set(&map->refcnt, 1);
atomic_set(&map->usercnt, 1);
@@ -971,7 +1004,7 @@ struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
EXPORT_SYMBOL_GPL(bpf_prog_get_type);
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define BPF_PROG_LOAD_LAST_FIELD prog_name
static int bpf_prog_load(union bpf_attr *attr)
{
@@ -1035,6 +1068,11 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_prog;
+ prog->aux->load_time = ktime_get_boot_ns();
+ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+ if (err)
+ goto free_prog;
+
/* run eBPF verifier */
err = bpf_check(&prog, attr);
if (err < 0)
@@ -1130,6 +1168,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
return 0;
}
+#define BPF_F_ATTACH_MASK \
+ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
static int bpf_prog_attach(const union bpf_attr *attr)
{
enum bpf_prog_type ptype;
@@ -1143,7 +1184,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
if (CHECK_ATTR(BPF_PROG_ATTACH))
return -EINVAL;
- if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+ if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
return -EINVAL;
switch (attr->attach_type) {
@@ -1174,8 +1215,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
return PTR_ERR(cgrp);
}
- ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
- attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+ attr->attach_flags);
if (ret)
bpf_prog_put(prog);
cgroup_put(cgrp);
@@ -1187,6 +1228,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
static int bpf_prog_detach(const union bpf_attr *attr)
{
+ enum bpf_prog_type ptype;
+ struct bpf_prog *prog;
struct cgroup *cgrp;
int ret;
@@ -1199,26 +1242,67 @@ static int bpf_prog_detach(const union bpf_attr *attr)
switch (attr->attach_type) {
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
+ ptype = BPF_PROG_TYPE_CGROUP_SKB;
+ break;
case BPF_CGROUP_INET_SOCK_CREATE:
+ ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+ break;
case BPF_CGROUP_SOCK_OPS:
- cgrp = cgroup_get_from_fd(attr->target_fd);
- if (IS_ERR(cgrp))
- return PTR_ERR(cgrp);
-
- ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
- cgroup_put(cgrp);
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
break;
case BPF_SK_SKB_STREAM_PARSER:
case BPF_SK_SKB_STREAM_VERDICT:
- ret = sockmap_get_from_fd(attr, false);
- break;
+ return sockmap_get_from_fd(attr, false);
default:
return -EINVAL;
}
+ cgrp = cgroup_get_from_fd(attr->target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+
+ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+ if (IS_ERR(prog))
+ prog = NULL;
+
+ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+ if (prog)
+ bpf_prog_put(prog);
+ cgroup_put(cgrp);
return ret;
}
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct cgroup *cgrp;
+ int ret;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+ if (CHECK_ATTR(BPF_PROG_QUERY))
+ return -EINVAL;
+ if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+ return -EINVAL;
+
+ switch (attr->query.attach_type) {
+ case BPF_CGROUP_INET_INGRESS:
+ case BPF_CGROUP_INET_EGRESS:
+ case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
+ break;
+ default:
+ return -EINVAL;
+ }
+ cgrp = cgroup_get_from_fd(attr->query.target_fd);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+ ret = cgroup_bpf_query(cgrp, attr, uattr);
+ cgroup_put(cgrp);
+ return ret;
+}
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1356,8 +1440,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
info.type = prog->type;
info.id = prog->aux->id;
+ info.load_time = prog->aux->load_time;
+ info.created_by_uid = from_kuid_munged(current_user_ns(),
+ prog->aux->user->uid);
memcpy(info.tag, prog->tag, sizeof(prog->tag));
+ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+ ulen = info.nr_map_ids;
+ info.nr_map_ids = prog->aux->used_map_cnt;
+ ulen = min_t(u32, info.nr_map_ids, ulen);
+ if (ulen) {
+ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+ u32 i;
+
+ for (i = 0; i < ulen; i++)
+ if (put_user(prog->aux->used_maps[i]->id,
+ &user_map_ids[i]))
+ return -EFAULT;
+ }
if (!capable(CAP_SYS_ADMIN)) {
info.jited_prog_len = 0;
@@ -1411,6 +1512,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
+ memcpy(info.name, map->name, sizeof(map->name));
if (copy_to_user(uinfo, &info, info_len) ||
put_user(info_len, &uattr->info.info_len))
@@ -1497,6 +1599,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_DETACH:
err = bpf_prog_detach(&attr);
break;
+ case BPF_PROG_QUERY:
+ err = bpf_prog_query(&attr, uattr);
+ break;
#endif
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 799b2451ef2d..52b022310f6a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -177,6 +177,12 @@ static __printf(1, 2) void verbose(const char *fmt, ...)
va_end(args);
}
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+ return type == PTR_TO_PACKET ||
+ type == PTR_TO_PACKET_META;
+}
+
/* string representation of 'enum bpf_reg_type' */
static const char * const reg_type_str[] = {
[NOT_INIT] = "?",
@@ -187,6 +193,7 @@ static const char * const reg_type_str[] = {
[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
[PTR_TO_STACK] = "fp",
[PTR_TO_PACKET] = "pkt",
+ [PTR_TO_PACKET_META] = "pkt_meta",
[PTR_TO_PACKET_END] = "pkt_end",
};
@@ -226,7 +233,7 @@ static void print_verifier_state(struct bpf_verifier_state *state)
verbose("(id=%d", reg->id);
if (t != SCALAR_VALUE)
verbose(",off=%d", reg->off);
- if (t == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(t))
verbose(",r=%d", reg->range);
else if (t == CONST_PTR_TO_MAP ||
t == PTR_TO_MAP_VALUE ||
@@ -325,26 +332,45 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
+static void print_bpf_end_insn(const struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ verbose("(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+ insn->imm, insn->dst_reg);
+}
+
static void print_bpf_insn(const struct bpf_verifier_env *env,
const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
if (class == BPF_ALU || class == BPF_ALU64) {
- if (BPF_SRC(insn->code) == BPF_X)
+ if (BPF_OP(insn->code) == BPF_END) {
+ if (class == BPF_ALU64)
+ verbose("BUG_alu64_%02x\n", insn->code);
+ else
+ print_bpf_end_insn(env, insn);
+ } else if (BPF_OP(insn->code) == BPF_NEG) {
+ verbose("(%02x) r%d = %s-r%d\n",
+ insn->code, insn->dst_reg,
+ class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
verbose("(%02x) %sr%d %s %sr%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
class == BPF_ALU ? "(u32) " : "",
insn->src_reg);
- else
+ } else {
verbose("(%02x) %sr%d %s %s%d\n",
insn->code, class == BPF_ALU ? "(u32) " : "",
insn->dst_reg,
bpf_alu_string[BPF_OP(insn->code) >> 4],
class == BPF_ALU ? "(u32) " : "",
insn->imm);
+ }
} else if (class == BPF_STX) {
if (BPF_MODE(insn->code) == BPF_MEM)
verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
@@ -519,6 +545,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
__mark_reg_known_zero(regs + regno);
}
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+ return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+ return reg_is_pkt_pointer(reg) ||
+ reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+ enum bpf_reg_type which)
+{
+ /* The register can already have a range from prior markings.
+ * This is fine as long as it hasn't been advanced from its
+ * origin.
+ */
+ return reg->type == which &&
+ reg->id == 0 &&
+ reg->off == 0 &&
+ tnum_equals_const(reg->var_off, 0);
+}
+
/* Attempts to improve min/max values based on var_off information */
static void __update_reg_bounds(struct bpf_reg_state *reg)
{
@@ -702,6 +753,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
case PTR_TO_STACK:
case PTR_TO_CTX:
case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET_END:
case CONST_PTR_TO_MAP:
return true;
@@ -1047,7 +1099,10 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
switch (reg->type) {
case PTR_TO_PACKET:
- /* special case, because of NET_IP_ALIGN */
+ case PTR_TO_PACKET_META:
+ /* Special case, because of NET_IP_ALIGN. Given metadata sits
+ * right in front, treat it the very same way.
+ */
return check_pkt_ptr_alignment(reg, off, size, strict);
case PTR_TO_MAP_VALUE:
pointer_desc = "value ";
@@ -1124,8 +1179,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
/* ctx access returns either a scalar, or a
- * PTR_TO_PACKET[_END]. In the latter case, we know
- * the offset is zero.
+ * PTR_TO_PACKET[_META,_END]. In the latter
+ * case, we know the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
mark_reg_unknown(state->regs, value_regno);
@@ -1170,7 +1225,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else {
err = check_stack_read(state, off, size, value_regno);
}
- } else if (reg->type == PTR_TO_PACKET) {
+ } else if (reg_is_pkt_pointer(reg)) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
return -EACCES;
@@ -1310,6 +1365,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
switch (reg->type) {
case PTR_TO_PACKET:
+ case PTR_TO_PACKET_META:
return check_packet_access(env, regno, reg->off, access_size);
case PTR_TO_MAP_VALUE:
return check_map_access(env, regno, reg->off, access_size);
@@ -1342,7 +1398,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
- if (type == PTR_TO_PACKET &&
+ if (type_is_pkt_pointer(type) &&
!may_access_direct_pkt_data(env, meta, BPF_READ)) {
verbose("helper access to the packet is not allowed\n");
return -EACCES;
@@ -1351,7 +1407,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
if (arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
expected_type = PTR_TO_STACK;
- if (type != PTR_TO_PACKET && type != expected_type)
+ if (!type_is_pkt_pointer(type) &&
+ type != expected_type)
goto err_type;
} else if (arg_type == ARG_CONST_SIZE ||
arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,7 +1432,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
*/
if (register_is_null(*reg))
/* final test in check_stack_boundary() */;
- else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+ else if (!type_is_pkt_pointer(type) &&
+ type != PTR_TO_MAP_VALUE &&
type != expected_type)
goto err_type;
meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -1401,7 +1459,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->key\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
meta->map_ptr->key_size);
else
@@ -1417,7 +1475,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
verbose("invalid map_ptr to access map->value\n");
return -EACCES;
}
- if (type == PTR_TO_PACKET)
+ if (type_is_pkt_pointer(type))
err = check_packet_access(env, regno, reg->off,
meta->map_ptr->value_size);
else
@@ -1590,8 +1648,8 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
return count > 1 ? -EINVAL : 0;
}
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
@@ -1600,18 +1658,15 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
int i;
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET ||
- regs[i].type == PTR_TO_PACKET_END)
+ if (reg_is_pkt_pointer_any(&regs[i]))
mark_reg_unknown(regs, i);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type != PTR_TO_PACKET &&
- reg->type != PTR_TO_PACKET_END)
- continue;
- __mark_reg_unknown(reg);
+ if (reg_is_pkt_pointer_any(reg))
+ __mark_reg_unknown(reg);
}
}
@@ -1871,7 +1926,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
dst_reg->range = 0;
@@ -1931,7 +1986,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
dst_reg->off = ptr_reg->off;
- if (ptr_reg->type == PTR_TO_PACKET) {
+ if (reg_is_pkt_pointer(ptr_reg)) {
dst_reg->id = ++env->id_gen;
/* something was added to pkt_ptr, set range to zero */
if (smin_val < 0)
@@ -2421,7 +2476,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
static void find_good_pkt_pointers(struct bpf_verifier_state *state,
- struct bpf_reg_state *dst_reg)
+ struct bpf_reg_state *dst_reg,
+ enum bpf_reg_type type)
{
struct bpf_reg_state *regs = state->regs, *reg;
int i;
@@ -2483,7 +2539,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
* dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
*/
for (i = 0; i < MAX_BPF_REG; i++)
- if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+ if (regs[i].type == type && regs[i].id == dst_reg->id)
/* keep the maximum range already checked */
regs[i].range = max_t(u16, regs[i].range, dst_reg->off);
@@ -2491,7 +2547,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
- if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+ if (reg->type == type && reg->id == dst_reg->id)
reg->range = max_t(u16, reg->range, dst_reg->off);
}
}
@@ -2856,19 +2912,39 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- find_good_pkt_pointers(this_branch, dst_reg);
+ find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
- find_good_pkt_pointers(other_branch, dst_reg);
+ find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
- find_good_pkt_pointers(other_branch, &regs[insn->src_reg]);
+ find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+ PTR_TO_PACKET);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
dst_reg->type == PTR_TO_PACKET_END &&
regs[insn->src_reg].type == PTR_TO_PACKET) {
- find_good_pkt_pointers(this_branch, &regs[insn->src_reg]);
+ find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+ PTR_TO_PACKET);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
+ dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+ find_good_pkt_pointers(this_branch, dst_reg, PTR_TO_PACKET_META);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
+ dst_reg->type == PTR_TO_PACKET_META &&
+ reg_is_init_pkt_pointer(&regs[insn->src_reg], PTR_TO_PACKET)) {
+ find_good_pkt_pointers(other_branch, dst_reg, PTR_TO_PACKET_META);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
+ reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+ find_good_pkt_pointers(other_branch, &regs[insn->src_reg],
+ PTR_TO_PACKET_META);
+ } else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
+ reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+ regs[insn->src_reg].type == PTR_TO_PACKET_META) {
+ find_good_pkt_pointers(this_branch, &regs[insn->src_reg],
+ PTR_TO_PACKET_META);
} else if (is_pointer_value(env, insn->dst_reg)) {
verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
return -EACCES;
@@ -2997,6 +3073,43 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static int check_return_code(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *reg;
+ struct tnum range = tnum_range(0, 1);
+
+ switch (env->prog->type) {
+ case BPF_PROG_TYPE_CGROUP_SKB:
+ case BPF_PROG_TYPE_CGROUP_SOCK:
+ case BPF_PROG_TYPE_SOCK_OPS:
+ break;
+ default:
+ return 0;
+ }
+
+ reg = &env->cur_state.regs[BPF_REG_0];
+ if (reg->type != SCALAR_VALUE) {
+ verbose("At program exit the register R0 is not a known value (%s)\n",
+ reg_type_str[reg->type]);
+ return -EINVAL;
+ }
+
+ if (!tnum_in(range, reg->var_off)) {
+ verbose("At program exit the register R0 ");
+ if (!tnum_is_unknown(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("has value %s", tn_buf);
+ } else {
+ verbose("has unknown scalar value");
+ }
+ verbose(" should have been 0 or 1\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
/* non-recursive DFS pseudo code
* 1 procedure DFS-iterative(G,v):
* 2 label v as discovered
@@ -3298,8 +3411,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
/* Check our ids match any regs they're supposed to */
return check_ids(rold->id, rcur->id, idmap);
+ case PTR_TO_PACKET_META:
case PTR_TO_PACKET:
- if (rcur->type != PTR_TO_PACKET)
+ if (rcur->type != rold->type)
return false;
/* We must have at least as much range as the old ptr
* did, so that any accesses which were safe before are
@@ -3786,6 +3900,9 @@ static int do_check(struct bpf_verifier_env *env)
return -EACCES;
}
+ err = check_return_code(env);
+ if (err)
+ return err;
process_bpf_exit:
insn_idx = pop_stack(env, &prev_insn_idx);
if (insn_idx < 0) {
@@ -4205,7 +4322,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
}
if (insn->imm == BPF_FUNC_redirect_map) {
- u64 addr = (unsigned long)prog;
+ /* Note, we cannot use prog directly as imm as subsequent
+ * rewrites would still change the prog pointer. The only
+ * stable address we can use is aux, which also works with
+ * prog clones during blinding.
+ */
+ u64 addr = (unsigned long)prog->aux;
struct bpf_insn r4_ld[] = {
BPF_LD_IMM64(BPF_REG_4, addr),
*insn,
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d6551cd45238..00f5b358aeac 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1896,6 +1896,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
if (ret)
goto destroy_root;
+ ret = cgroup_bpf_inherit(root_cgrp);
+ WARN_ON_ONCE(ret);
+
trace_cgroup_setup_root(root);
/*
@@ -2311,6 +2314,14 @@ out_release_tset:
list_del_init(&cset->mg_node);
}
spin_unlock_irq(&css_set_lock);
+
+ /*
+ * Re-initialize the cgroup_taskset structure in case it is reused
+ * again in another cgroup_migrate_add_task()/cgroup_migrate_execute()
+ * iteration.
+ */
+ tset->nr_tasks = 0;
+ tset->csets = &tset->src_csets;
return ret;
}
@@ -4713,6 +4724,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
cgrp->self.parent = &parent->self;
cgrp->root = root;
cgrp->level = level;
+ ret = cgroup_bpf_inherit(cgrp);
+ if (ret)
+ goto out_idr_free;
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4747,13 +4761,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
if (!cgroup_on_dfl(cgrp))
cgrp->subtree_control = cgroup_control(cgrp);
- if (parent)
- cgroup_bpf_inherit(cgrp, parent);
-
cgroup_propagate_control(cgrp);
return cgrp;
+out_idr_free:
+ cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5736,14 +5749,33 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+ enum bpf_attach_type type, u32 flags)
+{
+ int ret;
+
+ mutex_lock(&cgroup_mutex);
+ ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+ mutex_unlock(&cgroup_mutex);
+ return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
{
- struct cgroup *parent = cgroup_parent(cgrp);
int ret;
mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+ ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5308fad51..8de11a29e495 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -46,11 +46,13 @@
* @bringup: Single callback bringup or teardown selector
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
- * @done: Signal completion to the issuer of the task
+ * @done_up: Signal completion to the issuer of the task for cpu-up
+ * @done_down: Signal completion to the issuer of the task for cpu-down
*/
struct cpuhp_cpu_state {
enum cpuhp_state state;
enum cpuhp_state target;
+ enum cpuhp_state fail;
#ifdef CONFIG_SMP
struct task_struct *thread;
bool should_run;
@@ -58,18 +60,39 @@ struct cpuhp_cpu_state {
bool single;
bool bringup;
struct hlist_node *node;
+ struct hlist_node *last;
enum cpuhp_state cb_state;
int result;
- struct completion done;
+ struct completion done_up;
+ struct completion done_down;
#endif
};
-static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state);
+static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
+ .fail = CPUHP_INVALID,
+};
#if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
-static struct lock_class_key cpuhp_state_key;
-static struct lockdep_map cpuhp_state_lock_map =
- STATIC_LOCKDEP_MAP_INIT("cpuhp_state", &cpuhp_state_key);
+static struct lockdep_map cpuhp_state_up_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
+static struct lockdep_map cpuhp_state_down_map =
+ STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map);
+
+
+static void inline cpuhp_lock_acquire(bool bringup)
+{
+ lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+
+static void inline cpuhp_lock_release(bool bringup)
+{
+ lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map);
+}
+#else
+
+static void inline cpuhp_lock_acquire(bool bringup) { }
+static void inline cpuhp_lock_release(bool bringup) { }
+
#endif
/**
@@ -123,13 +146,16 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
/**
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
- * @step: The step in the state machine
+ * @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
+ * @node: For multi-instance, do a single entry callback for install/remove
+ * @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
- bool bringup, struct hlist_node *node)
+ bool bringup, struct hlist_node *node,
+ struct hlist_node **lastp)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state);
@@ -137,7 +163,17 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
int (*cb)(unsigned int cpu);
int ret, cnt;
+ if (st->fail == state) {
+ st->fail = CPUHP_INVALID;
+
+ if (!(bringup ? step->startup.single : step->teardown.single))
+ return 0;
+
+ return -EAGAIN;
+ }
+
if (!step->multi_instance) {
+ WARN_ON_ONCE(lastp && *lastp);
cb = bringup ? step->startup.single : step->teardown.single;
if (!cb)
return 0;
@@ -152,6 +188,7 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* Single invocation for instance add/remove */
if (node) {
+ WARN_ON_ONCE(lastp && *lastp);
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
@@ -161,13 +198,23 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
/* State transition. Invoke on all instances */
cnt = 0;
hlist_for_each(node, &step->list) {
+ if (lastp && node == *lastp)
+ break;
+
trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
ret = cbm(cpu, node);
trace_cpuhp_exit(cpu, st->state, state, ret);
- if (ret)
- goto err;
+ if (ret) {
+ if (!lastp)
+ goto err;
+
+ *lastp = node;
+ return ret;
+ }
cnt++;
}
+ if (lastp)
+ *lastp = NULL;
return 0;
err:
/* Rollback the instances if one failed */
@@ -178,12 +225,39 @@ err:
hlist_for_each(node, &step->list) {
if (!cnt--)
break;
- cbm(cpu, node);
+
+ trace_cpuhp_multi_enter(cpu, st->target, state, cbm, node);
+ ret = cbm(cpu, node);
+ trace_cpuhp_exit(cpu, st->state, state, ret);
+ /*
+ * Rollback must not fail,
+ */
+ WARN_ON_ONCE(ret);
}
return ret;
}
#ifdef CONFIG_SMP
+static inline void wait_for_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ wait_for_completion(done);
+}
+
+static inline void complete_ap_thread(struct cpuhp_cpu_state *st, bool bringup)
+{
+ struct completion *done = bringup ? &st->done_up : &st->done_down;
+ complete(done);
+}
+
+/*
+ * The former STARTING/DYING states, ran with IRQs disabled and must not fail.
+ */
+static bool cpuhp_is_atomic_state(enum cpuhp_state state)
+{
+ return CPUHP_AP_IDLE_DEAD <= state && state < CPUHP_AP_ONLINE;
+}
+
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
static DEFINE_MUTEX(cpu_add_remove_lock);
bool cpuhp_tasks_frozen;
@@ -271,14 +345,79 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+static inline enum cpuhp_state
+cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+
+ st->rollback = false;
+ st->last = NULL;
+
+ st->target = target;
+ st->single = false;
+ st->bringup = st->state < target;
+
+ return prev_state;
+}
+
+static inline void
+cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
+{
+ st->rollback = true;
+
+ /*
+ * If we have st->last we need to undo partial multi_instance of this
+ * state first. Otherwise start undo at the previous state.
+ */
+ if (!st->last) {
+ if (st->bringup)
+ st->state--;
+ else
+ st->state++;
+ }
+
+ st->target = prev_state;
+ st->bringup = !st->bringup;
+}
+
+/* Regular hotplug invocation of the AP hotplug thread */
+static void __cpuhp_kick_ap(struct cpuhp_cpu_state *st)
+{
+ if (!st->single && st->state == st->target)
+ return;
+
+ st->result = 0;
+ /*
+ * Make sure the above stores are visible before should_run becomes
+ * true. Paired with the mb() above in cpuhp_thread_fun()
+ */
+ smp_mb();
+ st->should_run = true;
+ wake_up_process(st->thread);
+ wait_for_ap_thread(st, st->bringup);
+}
+
+static int cpuhp_kick_ap(struct cpuhp_cpu_state *st, enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state;
+ int ret;
+
+ prev_state = cpuhp_set_state(st, target);
+ __cpuhp_kick_ap(st);
+ if ((ret = st->result)) {
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
+}
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
/* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, true);
if (WARN_ON_ONCE((!cpu_online(cpu))))
return -ECANCELED;
@@ -286,12 +425,10 @@ static int bringup_wait_for_ap(unsigned int cpu)
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE) {
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- }
- return st->result;
+ if (st->target <= CPUHP_AP_ONLINE_IDLE)
+ return 0;
+
+ return cpuhp_kick_ap(st, st->target);
}
static int bringup_cpu(unsigned int cpu)
@@ -317,32 +454,6 @@ static int bringup_cpu(unsigned int cpu)
/*
* Hotplug state machine related functions
*/
-static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- for (st->state++; st->state < st->target; st->state++) {
- struct cpuhp_step *step = cpuhp_get_step(st->state);
-
- if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
- }
-}
-
-static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
- enum cpuhp_state target)
-{
- enum cpuhp_state prev_state = st->state;
- int ret = 0;
-
- for (; st->state > target; st->state--) {
- ret = cpuhp_invoke_callback(cpu, st->state, false, NULL);
- if (ret) {
- st->target = prev_state;
- undo_cpu_down(cpu, st);
- break;
- }
- }
- return ret;
-}
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
{
@@ -350,7 +461,7 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
struct cpuhp_step *step = cpuhp_get_step(st->state);
if (!step->skip_onerr)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
}
}
@@ -362,7 +473,7 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
while (st->state < target) {
st->state++;
- ret = cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
st->target = prev_state;
undo_cpu_up(cpu, st);
@@ -379,7 +490,8 @@ static void cpuhp_create(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- init_completion(&st->done);
+ init_completion(&st->done_up);
+ init_completion(&st->done_down);
}
static int cpuhp_should_run(unsigned int cpu)
@@ -389,69 +501,90 @@ static int cpuhp_should_run(unsigned int cpu)
return st->should_run;
}
-/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */
-static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU);
-
- return cpuhp_down_callbacks(cpu, st, target);
-}
-
-/* Execute the online startup callbacks. Used to be CPU_ONLINE */
-static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st)
-{
- return cpuhp_up_callbacks(cpu, st, st->target);
-}
-
/*
* Execute teardown/startup callbacks on the plugged cpu. Also used to invoke
* callbacks when a state gets [un]installed at runtime.
+ *
+ * Each invocation of this function by the smpboot thread does a single AP
+ * state callback.
+ *
+ * It has 3 modes of operation:
+ * - single: runs st->cb_state
+ * - up: runs ++st->state, while st->state < st->target
+ * - down: runs st->state--, while st->state > st->target
+ *
+ * When complete or on error, should_run is cleared and the completion is fired.
*/
static void cpuhp_thread_fun(unsigned int cpu)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- int ret = 0;
+ bool bringup = st->bringup;
+ enum cpuhp_state state;
/*
- * Paired with the mb() in cpuhp_kick_ap_work and
- * cpuhp_invoke_ap_callback, so the work set is consistent visible.
+ * ACQUIRE for the cpuhp_should_run() load of ->should_run. Ensures
+ * that if we see ->should_run we also see the rest of the state.
*/
smp_mb();
- if (!st->should_run)
+
+ if (WARN_ON_ONCE(!st->should_run))
return;
- st->should_run = false;
+ cpuhp_lock_acquire(bringup);
- lock_map_acquire(&cpuhp_state_lock_map);
- /* Single callback invocation for [un]install ? */
if (st->single) {
- if (st->cb_state < CPUHP_AP_ONLINE) {
- local_irq_disable();
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
- local_irq_enable();
+ state = st->cb_state;
+ st->should_run = false;
+ } else {
+ if (bringup) {
+ st->state++;
+ state = st->state;
+ st->should_run = (st->state < st->target);
+ WARN_ON_ONCE(st->state > st->target);
} else {
- ret = cpuhp_invoke_callback(cpu, st->cb_state,
- st->bringup, st->node);
+ state = st->state;
+ st->state--;
+ st->should_run = (st->state > st->target);
+ WARN_ON_ONCE(st->state < st->target);
}
- } else if (st->rollback) {
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
+ }
+
+ WARN_ON_ONCE(!cpuhp_is_ap_state(state));
- undo_cpu_down(cpu, st);
- st->rollback = false;
+ if (st->rollback) {
+ struct cpuhp_step *step = cpuhp_get_step(state);
+ if (step->skip_onerr)
+ goto next;
+ }
+
+ if (cpuhp_is_atomic_state(state)) {
+ local_irq_disable();
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ local_irq_enable();
+
+ /*
+ * STARTING/DYING must not fail!
+ */
+ WARN_ON_ONCE(st->result);
} else {
- /* Cannot happen .... */
- BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE);
-
- /* Regular hotplug work */
- if (st->state < st->target)
- ret = cpuhp_ap_online(cpu, st);
- else if (st->state > st->target)
- ret = cpuhp_ap_offline(cpu, st);
+ st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
+ }
+
+ if (st->result) {
+ /*
+ * If we fail on a rollback, we're up a creek without no
+ * paddle, no way forward, no way back. We loose, thanks for
+ * playing.
+ */
+ WARN_ON_ONCE(st->rollback);
+ st->should_run = false;
}
- lock_map_release(&cpuhp_state_lock_map);
- st->result = ret;
- complete(&st->done);
+
+next:
+ cpuhp_lock_release(bringup);
+
+ if (!st->should_run)
+ complete_ap_thread(st, bringup);
}
/* Invoke a single callback on a remote cpu */
@@ -460,62 +593,64 @@ cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, bool bringup,
struct hlist_node *node)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ int ret;
if (!cpu_online(cpu))
return 0;
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
/*
* If we are up and running, use the hotplug thread. For early calls
* we invoke the thread function directly.
*/
if (!st->thread)
- return cpuhp_invoke_callback(cpu, state, bringup, node);
+ return cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
+ st->rollback = false;
+ st->last = NULL;
+
+ st->node = node;
+ st->bringup = bringup;
st->cb_state = state;
st->single = true;
- st->bringup = bringup;
- st->node = node;
- /*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
- */
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
- wait_for_completion(&st->done);
- return st->result;
-}
+ __cpuhp_kick_ap(st);
-/* Regular hotplug invocation of the AP hotplug thread */
-static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st)
-{
- st->result = 0;
- st->single = false;
/*
- * Make sure the above stores are visible before should_run becomes
- * true. Paired with the mb() above in cpuhp_thread_fun()
+ * If we failed and did a partial, do a rollback.
*/
- smp_mb();
- st->should_run = true;
- wake_up_process(st->thread);
+ if ((ret = st->result) && st->last) {
+ st->rollback = true;
+ st->bringup = !bringup;
+
+ __cpuhp_kick_ap(st);
+ }
+
+ return ret;
}
static int cpuhp_kick_ap_work(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
- enum cpuhp_state state = st->state;
+ enum cpuhp_state prev_state = st->state;
+ int ret;
+
+ cpuhp_lock_acquire(false);
+ cpuhp_lock_release(false);
+
+ cpuhp_lock_acquire(true);
+ cpuhp_lock_release(true);
+
+ trace_cpuhp_enter(cpu, st->target, prev_state, cpuhp_kick_ap_work);
+ ret = cpuhp_kick_ap(st, st->target);
+ trace_cpuhp_exit(cpu, st->state, prev_state, ret);
- trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work);
- lock_map_acquire(&cpuhp_state_lock_map);
- lock_map_release(&cpuhp_state_lock_map);
- __cpuhp_kick_ap_work(st);
- wait_for_completion(&st->done);
- trace_cpuhp_exit(cpu, st->state, state, st->result);
- return st->result;
+ return ret;
}
static struct smp_hotplug_thread cpuhp_threads = {
@@ -581,6 +716,7 @@ static int take_cpu_down(void *_param)
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE);
int err, cpu = smp_processor_id();
+ int ret;
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
@@ -594,8 +730,13 @@ static int take_cpu_down(void *_param)
WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
st->state--;
/* Invoke the former CPU_DYING callbacks */
- for (; st->state > target; st->state--)
- cpuhp_invoke_callback(cpu, st->state, false, NULL);
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ /*
+ * DYING must not fail!
+ */
+ WARN_ON_ONCE(ret);
+ }
/* Give up timekeeping duties */
tick_handover_do_timer();
@@ -639,7 +780,7 @@ static int takedown_cpu(unsigned int cpu)
*
* Wait for the stop thread to go away.
*/
- wait_for_completion(&st->done);
+ wait_for_ap_thread(st, false);
BUG_ON(st->state != CPUHP_AP_IDLE_DEAD);
/* Interrupts are moved away from the dying cpu, reenable alloc/free */
@@ -658,7 +799,7 @@ static void cpuhp_complete_idle_dead(void *arg)
{
struct cpuhp_cpu_state *st = arg;
- complete(&st->done);
+ complete_ap_thread(st, false);
}
void cpuhp_report_idle_dead(void)
@@ -676,11 +817,32 @@ void cpuhp_report_idle_dead(void)
cpuhp_complete_idle_dead, st, 0);
}
-#else
-#define takedown_cpu NULL
-#endif
+static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
+{
+ for (st->state++; st->state < st->target; st->state++) {
+ struct cpuhp_step *step = cpuhp_get_step(st->state);
-#ifdef CONFIG_HOTPLUG_CPU
+ if (!step->skip_onerr)
+ cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ }
+}
+
+static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
+ enum cpuhp_state target)
+{
+ enum cpuhp_state prev_state = st->state;
+ int ret = 0;
+
+ for (; st->state > target; st->state--) {
+ ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
+ if (ret) {
+ st->target = prev_state;
+ undo_cpu_down(cpu, st);
+ break;
+ }
+ }
+ return ret;
+}
/* Requires cpu_add_remove_lock to be held */
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
@@ -699,13 +861,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
cpuhp_tasks_frozen = tasks_frozen;
- prev_state = st->state;
- st->target = target;
+ prev_state = cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread.
*/
if (st->state > CPUHP_TEARDOWN_CPU) {
+ st->target = max((int)target, CPUHP_TEARDOWN_CPU);
ret = cpuhp_kick_ap_work(cpu);
/*
* The AP side has done the error rollback already. Just
@@ -720,6 +882,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
if (st->state > CPUHP_TEARDOWN_CPU)
goto out;
+
+ st->target = target;
}
/*
* The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need
@@ -727,9 +891,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
*/
ret = cpuhp_down_callbacks(cpu, st, target);
if (ret && st->state > CPUHP_TEARDOWN_CPU && st->state < prev_state) {
- st->target = prev_state;
- st->rollback = true;
- cpuhp_kick_ap_work(cpu);
+ cpuhp_reset_state(st, prev_state);
+ __cpuhp_kick_ap(st);
}
out:
@@ -754,11 +917,15 @@ out:
cpu_maps_update_done();
return err;
}
+
int cpu_down(unsigned int cpu)
{
return do_cpu_down(cpu, CPUHP_OFFLINE);
}
EXPORT_SYMBOL(cpu_down);
+
+#else
+#define takedown_cpu NULL
#endif /*CONFIG_HOTPLUG_CPU*/
/**
@@ -772,11 +939,16 @@ void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
+ int ret;
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
while (st->state < target) {
st->state++;
- cpuhp_invoke_callback(cpu, st->state, true, NULL);
+ ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
+ /*
+ * STARTING must not fail!
+ */
+ WARN_ON_ONCE(ret);
}
}
@@ -794,7 +966,7 @@ void cpuhp_online_idle(enum cpuhp_state state)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
- complete(&st->done);
+ complete_ap_thread(st, true);
}
/* Requires cpu_add_remove_lock to be held */
@@ -829,7 +1001,7 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
cpuhp_tasks_frozen = tasks_frozen;
- st->target = target;
+ cpuhp_set_state(st, target);
/*
* If the current CPU state is in the range of the AP hotplug thread,
* then we need to kick the thread once more.
@@ -1296,6 +1468,10 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
struct cpuhp_step *sp = cpuhp_get_step(state);
int ret;
+ /*
+ * If there's nothing to do, we done.
+ * Relies on the union for multi_instance.
+ */
if ((bringup && !sp->startup.single) ||
(!bringup && !sp->teardown.single))
return 0;
@@ -1307,9 +1483,9 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
if (cpuhp_is_ap_state(state))
ret = cpuhp_invoke_ap_callback(cpu, state, bringup, node);
else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#else
- ret = cpuhp_invoke_callback(cpu, state, bringup, node);
+ ret = cpuhp_invoke_callback(cpu, state, bringup, node, NULL);
#endif
BUG_ON(ret && !bringup);
return ret;
@@ -1641,9 +1817,55 @@ static ssize_t show_cpuhp_target(struct device *dev,
}
static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
+
+static ssize_t write_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+ struct cpuhp_step *sp;
+ int fail, ret;
+
+ ret = kstrtoint(buf, 10, &fail);
+ if (ret)
+ return ret;
+
+ /*
+ * Cannot fail STARTING/DYING callbacks.
+ */
+ if (cpuhp_is_atomic_state(fail))
+ return -EINVAL;
+
+ /*
+ * Cannot fail anything that doesn't have callbacks.
+ */
+ mutex_lock(&cpuhp_state_mutex);
+ sp = cpuhp_get_step(fail);
+ if (!sp->startup.single && !sp->teardown.single)
+ ret = -EINVAL;
+ mutex_unlock(&cpuhp_state_mutex);
+ if (ret)
+ return ret;
+
+ st->fail = fail;
+
+ return count;
+}
+
+static ssize_t show_cpuhp_fail(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
+
+ return sprintf(buf, "%d\n", st->fail);
+}
+
+static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
&dev_attr_target.attr,
+ &dev_attr_fail.attr,
NULL
};
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 3e691b75b2db..6bc21e202ae4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -8171,6 +8171,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
}
}
event->tp_event->prog = prog;
+ event->tp_event->bpf_prog_owner = event;
return 0;
}
@@ -8185,7 +8186,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
return;
prog = event->tp_event->prog;
- if (prog) {
+ if (prog && event->tp_event->bpf_prog_owner == event) {
event->tp_event->prog = NULL;
bpf_prog_put(prog);
}
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index af71a84e12ee..f684d8e5fa2b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -412,6 +412,19 @@ err:
return NULL;
}
+static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
+{
+ if (rb->aux_overwrite)
+ return false;
+
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Commit the data written by hardware into the ring buffer by adjusting
* aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the
@@ -451,10 +464,8 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
}
rb->user_page->aux_head = rb->aux_head;
- if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ if (rb_need_aux_wakeup(rb))
wakeup = true;
- rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
- }
if (wakeup) {
if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)
@@ -484,9 +495,8 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
rb->aux_head += size;
rb->user_page->aux_head = rb->aux_head;
- if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
+ if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
- rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 3481ababd06a..f2cd53e92147 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1600,12 +1600,10 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
struct waitid_info info = {.status = 0};
long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
int signo = 0;
+
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err) {
if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
return -EFAULT;
}
@@ -1723,16 +1721,15 @@ COMPAT_SYSCALL_DEFINE5(waitid,
if (err > 0) {
signo = SIGCHLD;
err = 0;
- }
-
- if (!err && uru) {
- /* kernel_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- err = copy_to_user(uru, &ru, sizeof(ru));
- else
- err = put_compat_rusage(&ru, uru);
- if (err)
- return -EFAULT;
+ if (uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
}
if (!infop)
diff --git a/kernel/extable.c b/kernel/extable.c
index 38c2412401a1..9aa1cc41ecf7 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -102,15 +102,7 @@ int core_kernel_data(unsigned long addr)
int __kernel_text_address(unsigned long addr)
{
- if (core_kernel_text(addr))
- return 1;
- if (is_module_text_address(addr))
- return 1;
- if (is_ftrace_trampoline(addr))
- return 1;
- if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
- if (is_bpf_text_address(addr))
+ if (kernel_text_address(addr))
return 1;
/*
* There might be init symbols in saved stacktraces.
@@ -127,17 +119,42 @@ int __kernel_text_address(unsigned long addr)
int kernel_text_address(unsigned long addr)
{
+ bool no_rcu;
+ int ret = 1;
+
if (core_kernel_text(addr))
return 1;
+
+ /*
+ * If a stack dump happens while RCU is not watching, then
+ * RCU needs to be notified that it requires to start
+ * watching again. This can happen either by tracing that
+ * triggers a stack trace, or a WARN() that happens during
+ * coming back from idle, or cpu on or offlining.
+ *
+ * is_module_text_address() as well as the kprobe slots
+ * and is_bpf_text_address() require RCU to be watching.
+ */
+ no_rcu = !rcu_is_watching();
+
+ /* Treat this like an NMI as it can happen anywhere */
+ if (no_rcu)
+ rcu_nmi_enter();
+
if (is_module_text_address(addr))
- return 1;
+ goto out;
if (is_ftrace_trampoline(addr))
- return 1;
+ goto out;
if (is_kprobe_optinsn_slot(addr) || is_kprobe_insn_slot(addr))
- return 1;
+ goto out;
if (is_bpf_text_address(addr))
- return 1;
- return 0;
+ goto out;
+ ret = 0;
+out:
+ if (no_rcu)
+ rcu_nmi_exit();
+
+ return ret;
}
/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..e702cb9ffbd8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -946,6 +946,24 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+#ifdef CONFIG_MMU
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct,
+ async_put_work);
+
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+#endif
+
/**
* set_mm_exe_file - change a reference to the mm's executable file
*
diff --git a/kernel/futex.c b/kernel/futex.c
index 3d38eaf05492..0518a0bfc746 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -821,8 +821,6 @@ static void get_pi_state(struct futex_pi_state *pi_state)
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
- *
- * Must be called with the hb lock held.
*/
static void put_pi_state(struct futex_pi_state *pi_state)
{
@@ -837,16 +835,22 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ struct task_struct *owner;
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ owner = pi_state->owner;
+ if (owner) {
+ raw_spin_lock(&owner->pi_lock);
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&owner->pi_lock);
+ }
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
}
- if (current->pi_state_cache)
+ if (current->pi_state_cache) {
kfree(pi_state);
- else {
+ } else {
/*
* pi_state->list is already empty.
* clear pi_state->owner.
@@ -907,13 +911,14 @@ void exit_pi_state_list(struct task_struct *curr)
raw_spin_unlock_irq(&curr->pi_lock);
spin_lock(&hb->lock);
-
- raw_spin_lock_irq(&curr->pi_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock(&curr->pi_lock);
/*
* We dropped the pi-lock, so re-check whether this
* task still owns the PI-state:
*/
if (head->next != next) {
+ raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
continue;
}
@@ -922,9 +927,10 @@ void exit_pi_state_list(struct task_struct *curr)
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
pi_state->owner = NULL;
- raw_spin_unlock_irq(&curr->pi_lock);
+ raw_spin_unlock(&curr->pi_lock);
get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
rt_mutex_futex_unlock(&pi_state->pi_mutex);
@@ -1208,6 +1214,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
+ /*
+ * Assignment without holding pi_state->pi_mutex.wait_lock is safe
+ * because there is no concurrency as the object is not published yet.
+ */
pi_state->owner = p;
raw_spin_unlock_irq(&p->pi_lock);
@@ -2878,6 +2888,7 @@ retry:
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
+ /* drops pi_state->pi_mutex.wait_lock */
ret = wake_futex_pi(uaddr, uval, pi_state);
put_pi_state(pi_state);
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f51b7b6d2451..6fc89fd93824 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -202,7 +202,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
irqd_clr_managed_shutdown(d);
- if (cpumask_any_and(aff, cpu_online_mask) > nr_cpu_ids) {
+ if (cpumask_any_and(aff, cpu_online_mask) >= nr_cpu_ids) {
/*
* Catch code which fiddles with enable_irq() on a managed
* and potentially shutdown IRQ. Chained interrupt
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index f7086b78ad6e..5270a54b9fa4 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -322,7 +322,6 @@ int __irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
/* Calc pointer to the next generic chip */
tmp += sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
}
- d->name = name;
return 0;
}
EXPORT_SYMBOL_GPL(__irq_alloc_domain_generic_chips);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index e84b7056bb08..ac4644e92b49 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -945,7 +945,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
struct irq_desc *desc;
struct irq_domain *domain;
struct radix_tree_iter iter;
- void **slot;
+ void __rcu **slot;
int i;
seq_printf(m, " %-16s %-6s %-10s %-10s %s\n",
@@ -1453,7 +1453,7 @@ out_free_desc:
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
{
- void **slot;
+ void __rcu **slot;
if (d->hwirq < d->domain->revmap_size)
return; /* Not using radix tree. */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 573dc52b0806..d00132b5c325 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1643,6 +1643,10 @@ const void *free_irq(unsigned int irq, void *dev_id)
#endif
action = __free_irq(irq, dev_id);
+
+ if (!action)
+ return NULL;
+
devname = action->name;
kfree(action);
return devname;
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index ea34ed8bb952..055bb2962a0b 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -131,7 +131,7 @@ static int kcmp_epoll_target(struct task_struct *task1,
if (filp_epoll) {
filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
fput(filp_epoll);
- } else
+ }
if (IS_ERR(filp_tgt))
return PTR_ERR(filp_tgt);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f660666ab8..1fefe6dcafd7 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
DEFINE_WAKE_Q(wake_q);
/*
+ * __rwsem_down_write_failed_common(sem)
+ * rwsem_optimistic_spin(sem)
+ * osq_unlock(sem->osq)
+ * ...
+ * atomic_long_add_return(&sem->count)
+ *
+ * - VS -
+ *
+ * __up_write()
+ * if (atomic_long_sub_return_release(&sem->count) < 0)
+ * rwsem_wake(sem)
+ * osq_is_locked(&sem->osq)
+ *
+ * And __up_write() must observe !osq_is_locked() when it observes the
+ * atomic_long_add_return() in order to not miss a wakeup.
+ *
+ * This boils down to:
+ *
+ * [S.rel] X = 1 [RmW] r0 = (Y += 0)
+ * MB RMB
+ * [RmW] Y += 1 [L] r1 = X
+ *
+ * exists (r0=1 /\ r1=0)
+ */
+ smp_rmb();
+
+ /*
* If a spinner is present, it is not necessary to do the wakeup.
* Try to do wakeup only if the trylock succeeds to minimize
* spinlock contention which may introduce too much delay in the
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 6bcbfbf1a8fd..403ab9cdb949 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -350,7 +350,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
pgprot_t pgprot = PAGE_KERNEL;
struct dev_pagemap *pgmap;
struct page_map *page_map;
- int error, nid, is_ram;
+ int error, nid, is_ram, i = 0;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
@@ -448,6 +448,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
list_del(&page->lru);
page->pgmap = pgmap;
percpu_ref_get(ref);
+ if (!(++i % 1024))
+ cond_resched();
}
devres_add(dev, page_map);
return __va(res->start);
diff --git a/kernel/params.c b/kernel/params.c
index 60b2d8101355..cc9108c2a1fd 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -224,7 +224,7 @@ char *parse_args(const char *doing,
} \
int param_get_##name(char *buffer, const struct kernel_param *kp) \
{ \
- return scnprintf(buffer, PAGE_SIZE, format, \
+ return scnprintf(buffer, PAGE_SIZE, format "\n", \
*((type *)kp->arg)); \
} \
const struct kernel_param_ops param_ops_##name = { \
@@ -236,14 +236,14 @@ char *parse_args(const char *doing,
EXPORT_SYMBOL(param_ops_##name)
-STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
-STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
-STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
-STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
-STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
-STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
-STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
-STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", kstrtou8);
+STANDARD_PARAM_DEF(short, short, "%hi", kstrtos16);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", kstrtou16);
+STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
+STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
@@ -270,7 +270,7 @@ EXPORT_SYMBOL(param_set_charp);
int param_get_charp(char *buffer, const struct kernel_param *kp)
{
- return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg));
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", *((char **)kp->arg));
}
EXPORT_SYMBOL(param_get_charp);
@@ -301,7 +301,7 @@ EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
@@ -360,7 +360,7 @@ EXPORT_SYMBOL(param_set_invbool);
int param_get_invbool(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+ return sprintf(buffer, "%c\n", (*(bool *)kp->arg) ? 'N' : 'Y');
}
EXPORT_SYMBOL(param_get_invbool);
@@ -460,8 +460,9 @@ static int param_array_get(char *buffer, const struct kernel_param *kp)
struct kernel_param p = *kp;
for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+ /* Replace \n with comma */
if (i)
- buffer[off++] = ',';
+ buffer[off - 1] = ',';
p.arg = arr->elem + arr->elemsize * i;
check_kparam_locked(p.mod);
ret = arr->ops->get(buffer + off, &p);
@@ -507,7 +508,7 @@ EXPORT_SYMBOL(param_set_copystring);
int param_get_string(char *buffer, const struct kernel_param *kp)
{
const struct kparam_string *kps = kp->str;
- return strlcpy(buffer, kps->string, kps->maxlen);
+ return scnprintf(buffer, PAGE_SIZE, "%s\n", kps->string);
}
EXPORT_SYMBOL(param_get_string);
@@ -549,10 +550,6 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
kernel_param_lock(mk->mod);
count = attribute->param->ops->get(buf, attribute->param);
kernel_param_unlock(mk->mod);
- if (count > 0) {
- strcat(buf, "\n");
- ++count;
- }
return count;
}
@@ -600,7 +597,7 @@ EXPORT_SYMBOL(kernel_param_unlock);
/*
* add_sysfs_param - add a parameter to sysfs
* @mk: struct module_kobject
- * @kparam: the actual parameter definition to add to sysfs
+ * @kp: the actual parameter definition to add to sysfs
* @name: name of parameter
*
* Create a kobject if for a (per-module) parameter if mp NULL, and
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3e2b4f519009..ccd2d20e6b06 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -120,22 +120,26 @@ static void s2idle_loop(void)
* frozen processes + suspended devices + idle processors.
* Thus s2idle_enter() should be called right after
* all devices have been suspended.
+ *
+ * Wakeups during the noirq suspend of devices may be spurious,
+ * so prevent them from terminating the loop right away.
*/
error = dpm_noirq_suspend_devices(PMSG_SUSPEND);
if (!error)
s2idle_enter();
+ else if (error == -EBUSY && pm_wakeup_pending())
+ error = 0;
- dpm_noirq_resume_devices(PMSG_RESUME);
- if (error && (error != -EBUSY || !pm_wakeup_pending())) {
- dpm_noirq_end();
- break;
- }
-
- if (s2idle_ops && s2idle_ops->wake)
+ if (!error && s2idle_ops && s2idle_ops->wake)
s2idle_ops->wake();
+ dpm_noirq_resume_devices(PMSG_RESUME);
+
dpm_noirq_end();
+ if (error)
+ break;
+
if (s2idle_ops && s2idle_ops->sync)
s2idle_ops->sync();
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1250e4bd4b85..b0ad62b0e7b8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -882,6 +882,11 @@ void rcu_irq_exit(void)
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
rdtp->dynticks_nesting < 1);
if (rdtp->dynticks_nesting <= 1) {
@@ -1015,6 +1020,11 @@ void rcu_irq_enter(void)
RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (rdtp->dynticks_nmi_nesting)
+ return;
+
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 18a6966567da..d17c5da523a0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5166,6 +5166,28 @@ void sched_show_task(struct task_struct *p)
put_task_stack(p);
}
+static inline bool
+state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
+ * TASK_KILLABLE).
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
+
void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
@@ -5188,7 +5210,7 @@ void show_state_filter(unsigned long state_filter)
*/
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
- if (!state_filter || (p->state & state_filter))
+ if (state_filter_match(state_filter, p))
sched_show_task(p);
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 01217fb5a5de..2f93e4a2d9f6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -466,8 +466,6 @@ static char *task_group_path(struct task_group *tg)
}
#endif
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 98b59b5db90b..bb3a38005b9c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -17,11 +17,13 @@
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
+#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
+#include <linux/sysctl.h>
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
@@ -42,6 +44,7 @@
* get/put helpers should be used when accessing an instance
* outside of a lifetime-guarded section. In general, this
* is only needed for handling filters shared across tasks.
+ * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
* @prev: points to a previously installed, or inherited, filter
* @prog: the BPF program to evaluate
*
@@ -57,6 +60,7 @@
*/
struct seccomp_filter {
refcount_t usage;
+ bool log;
struct seccomp_filter *prev;
struct bpf_prog *prog;
};
@@ -171,10 +175,15 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
/**
* seccomp_run_filters - evaluates all seccomp filters against @sd
* @sd: optional seccomp data to be passed to filters
+ * @match: stores struct seccomp_filter that resulted in the return value,
+ * unless filter returned SECCOMP_RET_ALLOW, in which case it will
+ * be unchanged.
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(const struct seccomp_data *sd)
+#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
+static u32 seccomp_run_filters(const struct seccomp_data *sd,
+ struct seccomp_filter **match)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
@@ -184,7 +193,7 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
- return SECCOMP_RET_KILL;
+ return SECCOMP_RET_KILL_PROCESS;
if (!sd) {
populate_seccomp_data(&sd_local);
@@ -198,8 +207,10 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd)
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
- if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
+ if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
+ *match = f;
+ }
}
return ret;
}
@@ -444,6 +455,10 @@ static long seccomp_attach_filter(unsigned int flags,
return ret;
}
+ /* Set log flag, if present. */
+ if (flags & SECCOMP_FILTER_FLAG_LOG)
+ filter->log = true;
+
/*
* If there is an existing filter, make it the prev and don't drop its
* task reference.
@@ -458,14 +473,19 @@ static long seccomp_attach_filter(unsigned int flags,
return 0;
}
+void __get_seccomp_filter(struct seccomp_filter *filter)
+{
+ /* Reference count is bounded by the number of total processes. */
+ refcount_inc(&filter->usage);
+}
+
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
if (!orig)
return;
- /* Reference count is bounded by the number of total processes. */
- refcount_inc(&orig->usage);
+ __get_seccomp_filter(orig);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -476,10 +496,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
}
}
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
+static void __put_seccomp_filter(struct seccomp_filter *orig)
{
- struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
while (orig && refcount_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
@@ -488,6 +506,12 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ __put_seccomp_filter(tsk->seccomp.filter);
+}
+
static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
memset(info, 0, sizeof(*info));
@@ -514,6 +538,65 @@ static void seccomp_send_sigsys(int syscall, int reason)
}
#endif /* CONFIG_SECCOMP_FILTER */
+/* For use with seccomp_actions_logged */
+#define SECCOMP_LOG_KILL_PROCESS (1 << 0)
+#define SECCOMP_LOG_KILL_THREAD (1 << 1)
+#define SECCOMP_LOG_TRAP (1 << 2)
+#define SECCOMP_LOG_ERRNO (1 << 3)
+#define SECCOMP_LOG_TRACE (1 << 4)
+#define SECCOMP_LOG_LOG (1 << 5)
+#define SECCOMP_LOG_ALLOW (1 << 6)
+
+static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
+ SECCOMP_LOG_KILL_THREAD |
+ SECCOMP_LOG_TRAP |
+ SECCOMP_LOG_ERRNO |
+ SECCOMP_LOG_TRACE |
+ SECCOMP_LOG_LOG;
+
+static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
+ bool requested)
+{
+ bool log = false;
+
+ switch (action) {
+ case SECCOMP_RET_ALLOW:
+ break;
+ case SECCOMP_RET_TRAP:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
+ break;
+ case SECCOMP_RET_ERRNO:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
+ break;
+ case SECCOMP_RET_TRACE:
+ log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
+ break;
+ case SECCOMP_RET_LOG:
+ log = seccomp_actions_logged & SECCOMP_LOG_LOG;
+ break;
+ case SECCOMP_RET_KILL_THREAD:
+ log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
+ break;
+ case SECCOMP_RET_KILL_PROCESS:
+ default:
+ log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
+ }
+
+ /*
+ * Force an audit message to be emitted when the action is RET_KILL_*,
+ * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
+ * allowed to be logged by the admin.
+ */
+ if (log)
+ return __audit_seccomp(syscall, signr, action);
+
+ /*
+ * Let the audit subsystem decide if the action should be audited based
+ * on whether the current task itself is being audited.
+ */
+ return audit_seccomp(syscall, signr, action);
+}
+
/*
* Secure computing mode 1 allows only read/write/exit/sigreturn.
* To be fully secure this must be combined with rlimit
@@ -539,7 +622,7 @@ static void __secure_computing_strict(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
- audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
+ seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
do_exit(SIGKILL);
}
@@ -566,6 +649,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
const bool recheck_after_trace)
{
u32 filter_ret, action;
+ struct seccomp_filter *match = NULL;
int data;
/*
@@ -574,9 +658,9 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
*/
rmb();
- filter_ret = seccomp_run_filters(sd);
+ filter_ret = seccomp_run_filters(sd, &match);
data = filter_ret & SECCOMP_RET_DATA;
- action = filter_ret & SECCOMP_RET_ACTION;
+ action = filter_ret & SECCOMP_RET_ACTION_FULL;
switch (action) {
case SECCOMP_RET_ERRNO:
@@ -637,14 +721,25 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
+ case SECCOMP_RET_LOG:
+ seccomp_log(this_syscall, 0, action, true);
+ return 0;
+
case SECCOMP_RET_ALLOW:
+ /*
+ * Note that the "match" filter will always be NULL for
+ * this action since SECCOMP_RET_ALLOW is the starting
+ * state in seccomp_run_filters().
+ */
return 0;
- case SECCOMP_RET_KILL:
+ case SECCOMP_RET_KILL_THREAD:
+ case SECCOMP_RET_KILL_PROCESS:
default:
- audit_seccomp(this_syscall, SIGSYS, action);
+ seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
- if (get_nr_threads(current) == 1) {
+ if (action == SECCOMP_RET_KILL_PROCESS ||
+ get_nr_threads(current) == 1) {
siginfo_t info;
/* Show the original registers in the dump. */
@@ -653,13 +748,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
seccomp_init_siginfo(&info, this_syscall, data);
do_coredump(&info);
}
- do_exit(SIGSYS);
+ if (action == SECCOMP_RET_KILL_PROCESS)
+ do_group_exit(SIGSYS);
+ else
+ do_exit(SIGSYS);
}
unreachable();
skip:
- audit_seccomp(this_syscall, 0, action);
+ seccomp_log(this_syscall, 0, action, match ? match->log : false);
return -1;
}
#else
@@ -794,6 +892,29 @@ static inline long seccomp_set_mode_filter(unsigned int flags,
}
#endif
+static long seccomp_get_action_avail(const char __user *uaction)
+{
+ u32 action;
+
+ if (copy_from_user(&action, uaction, sizeof(action)))
+ return -EFAULT;
+
+ switch (action) {
+ case SECCOMP_RET_KILL_PROCESS:
+ case SECCOMP_RET_KILL_THREAD:
+ case SECCOMP_RET_TRAP:
+ case SECCOMP_RET_ERRNO:
+ case SECCOMP_RET_TRACE:
+ case SECCOMP_RET_LOG:
+ case SECCOMP_RET_ALLOW:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return 0;
+}
+
/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
const char __user *uargs)
@@ -805,6 +926,11 @@ static long do_seccomp(unsigned int op, unsigned int flags,
return seccomp_set_mode_strict();
case SECCOMP_SET_MODE_FILTER:
return seccomp_set_mode_filter(flags, uargs);
+ case SECCOMP_GET_ACTION_AVAIL:
+ if (flags != 0)
+ return -EINVAL;
+
+ return seccomp_get_action_avail(uargs);
default:
return -EINVAL;
}
@@ -908,13 +1034,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
if (!data)
goto out;
- get_seccomp_filter(task);
+ __get_seccomp_filter(filter);
spin_unlock_irq(&task->sighand->siglock);
if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
ret = -EFAULT;
- put_seccomp_filter(task);
+ __put_seccomp_filter(filter);
return ret;
out:
@@ -922,3 +1048,185 @@ out:
return ret;
}
#endif
+
+#ifdef CONFIG_SYSCTL
+
+/* Human readable action names for friendly sysctl interaction */
+#define SECCOMP_RET_KILL_PROCESS_NAME "kill_process"
+#define SECCOMP_RET_KILL_THREAD_NAME "kill_thread"
+#define SECCOMP_RET_TRAP_NAME "trap"
+#define SECCOMP_RET_ERRNO_NAME "errno"
+#define SECCOMP_RET_TRACE_NAME "trace"
+#define SECCOMP_RET_LOG_NAME "log"
+#define SECCOMP_RET_ALLOW_NAME "allow"
+
+static const char seccomp_actions_avail[] =
+ SECCOMP_RET_KILL_PROCESS_NAME " "
+ SECCOMP_RET_KILL_THREAD_NAME " "
+ SECCOMP_RET_TRAP_NAME " "
+ SECCOMP_RET_ERRNO_NAME " "
+ SECCOMP_RET_TRACE_NAME " "
+ SECCOMP_RET_LOG_NAME " "
+ SECCOMP_RET_ALLOW_NAME;
+
+struct seccomp_log_name {
+ u32 log;
+ const char *name;
+};
+
+static const struct seccomp_log_name seccomp_log_names[] = {
+ { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
+ { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
+ { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
+ { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
+ { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
+ { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
+ { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
+ { }
+};
+
+static bool seccomp_names_from_actions_logged(char *names, size_t size,
+ u32 actions_logged)
+{
+ const struct seccomp_log_name *cur;
+ bool append_space = false;
+
+ for (cur = seccomp_log_names; cur->name && size; cur++) {
+ ssize_t ret;
+
+ if (!(actions_logged & cur->log))
+ continue;
+
+ if (append_space) {
+ ret = strscpy(names, " ", size);
+ if (ret < 0)
+ return false;
+
+ names += ret;
+ size -= ret;
+ } else
+ append_space = true;
+
+ ret = strscpy(names, cur->name, size);
+ if (ret < 0)
+ return false;
+
+ names += ret;
+ size -= ret;
+ }
+
+ return true;
+}
+
+static bool seccomp_action_logged_from_name(u32 *action_logged,
+ const char *name)
+{
+ const struct seccomp_log_name *cur;
+
+ for (cur = seccomp_log_names; cur->name; cur++) {
+ if (!strcmp(cur->name, name)) {
+ *action_logged = cur->log;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
+{
+ char *name;
+
+ *actions_logged = 0;
+ while ((name = strsep(&names, " ")) && *name) {
+ u32 action_logged = 0;
+
+ if (!seccomp_action_logged_from_name(&action_logged, name))
+ return false;
+
+ *actions_logged |= action_logged;
+ }
+
+ return true;
+}
+
+static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ char names[sizeof(seccomp_actions_avail)];
+ struct ctl_table table;
+ int ret;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ memset(names, 0, sizeof(names));
+
+ if (!write) {
+ if (!seccomp_names_from_actions_logged(names, sizeof(names),
+ seccomp_actions_logged))
+ return -EINVAL;
+ }
+
+ table = *ro_table;
+ table.data = names;
+ table.maxlen = sizeof(names);
+ ret = proc_dostring(&table, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ if (write) {
+ u32 actions_logged;
+
+ if (!seccomp_actions_logged_from_names(&actions_logged,
+ table.data))
+ return -EINVAL;
+
+ if (actions_logged & SECCOMP_LOG_ALLOW)
+ return -EINVAL;
+
+ seccomp_actions_logged = actions_logged;
+ }
+
+ return 0;
+}
+
+static struct ctl_path seccomp_sysctl_path[] = {
+ { .procname = "kernel", },
+ { .procname = "seccomp", },
+ { }
+};
+
+static struct ctl_table seccomp_sysctl_table[] = {
+ {
+ .procname = "actions_avail",
+ .data = (void *) &seccomp_actions_avail,
+ .maxlen = sizeof(seccomp_actions_avail),
+ .mode = 0444,
+ .proc_handler = proc_dostring,
+ },
+ {
+ .procname = "actions_logged",
+ .mode = 0644,
+ .proc_handler = seccomp_actions_logged_handler,
+ },
+ { }
+};
+
+static int __init seccomp_sysctl_init(void)
+{
+ struct ctl_table_header *hdr;
+
+ hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
+ if (!hdr)
+ pr_warn("seccomp: sysctl registration failed\n");
+ else
+ kmemleak_not_leak(hdr);
+
+ return 0;
+}
+
+device_initcall(seccomp_sysctl_init)
+
+#endif /* CONFIG_SYSCTL */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6648fbbb8157..4da9e622471f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,7 +367,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_sched_time_avg,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
#ifdef CONFIG_SCHEDSTATS
{
@@ -2187,8 +2188,6 @@ static int do_proc_douintvec_conv(unsigned long *lvalp,
if (write) {
if (*lvalp > UINT_MAX)
return -EINVAL;
- if (*lvalp > UINT_MAX)
- return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2a685b45b73b..45a3928544ce 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -648,6 +648,12 @@ int blk_trace_startstop(struct request_queue *q, int start)
}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ */
+
/**
* blk_trace_ioctl: - handle the ioctls associated with tracing
* @bdev: the block device
@@ -665,7 +671,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
switch (cmd) {
case BLKTRACESETUP:
@@ -691,7 +697,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
break;
}
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
return ret;
}
@@ -1727,7 +1733,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!q->blk_trace);
@@ -1746,7 +1752,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
@@ -1788,7 +1794,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
if (attr == &dev_attr_enable) {
if (value)
@@ -1814,7 +1820,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6abfafd7f173..8319e09e15b9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4954,9 +4954,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_graph_set_hash(struct ftrace_hash *hash, char *buffer);
-static unsigned long save_global_trampoline;
-static unsigned long save_global_flags;
-
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -6808,17 +6805,6 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * Function graph does not allocate the trampoline, but
- * other global_ops do. We need to reset the ALLOC_TRAMP flag
- * if one was used.
- */
- global_ops.trampoline = save_global_trampoline;
- if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
- global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
-#endif
-
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5360b7aec57a..752e5daf0896 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4020,11 +4020,17 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
int cpu = tracing_get_cpu(inode);
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->current_trace->print_max)
+ trace_buf = &tr->max_buffer;
+#endif
if (cpu == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(trace_buf);
else
- tracing_reset(&tr->trace_buffer, cpu);
+ tracing_reset(trace_buf, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -5358,6 +5364,13 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t == tr->current_trace)
goto out;
+ /* Some tracers won't work on kernel command line */
+ if (system_state < SYSTEM_RUNNING && t->noboot) {
+ pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
+ t->name);
+ goto out;
+ }
+
/* Some tracers are only allowed for the top level buffer */
if (!trace_ok_for_array(t, tr)) {
ret = -EINVAL;
@@ -5667,7 +5680,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (!tracing_is_on() && iter->pos)
+ if (!tracer_tracing_is_on(iter->tr) && iter->pos)
break;
mutex_unlock(&iter->mutex);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fb5d54d0d1b3..652c682707cd 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -444,6 +444,8 @@ struct tracer {
#ifdef CONFIG_TRACER_MAX_TRACE
bool use_max_tr;
#endif
+ /* True if tracer cannot be enabled in kernel param */
+ bool noboot;
};
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index cd7480d0a201..dca78fc48439 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -282,6 +282,7 @@ static struct tracer mmio_tracer __read_mostly =
.close = mmio_close,
.read = mmio_read,
.print_line = mmio_print_line,
+ .noboot = true,
};
__init static int init_mmio_trace(void)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
- int bit = state ? __ffs(state) + 1 : 0;
-
- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..0c331978b1a6 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
+ entry->prev_state = __get_task_state(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
- entry->next_state = next->state;
+ entry->next_state = __get_task_state(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
+ entry->prev_state = __get_task_state(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
+ entry->next_state = __get_task_state(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a4df67cbc711..49cb41412eec 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -96,23 +96,9 @@ check_stack(unsigned long ip, unsigned long *stack)
if (in_nmi())
return;
- /*
- * There's a slight chance that we are tracing inside the
- * RCU infrastructure, and rcu_irq_enter() will not work
- * as expected.
- */
- if (unlikely(rcu_irq_enter_disabled()))
- return;
-
local_irq_save(flags);
arch_spin_lock(&stack_trace_max_lock);
- /*
- * RCU may not be watching, make it see us.
- * The stack trace code uses rcu_sched.
- */
- rcu_irq_enter();
-
/* In case another CPU set the tracer_frame on us */
if (unlikely(!frame_size))
this_size -= tracer_frame;
@@ -205,7 +191,6 @@ check_stack(unsigned long ip, unsigned long *stack)
}
out:
- rcu_irq_exit();
arch_spin_unlock(&stack_trace_max_lock);
local_irq_restore(flags);
}