Merge branch 'linus' into perf/urgent, to synchronize UAPI headers

Signed-off-by: Ingo Molnar <mingo@kernel.org>
author: Ingo Molnar <mingo@kernel.org> 2017-12-06 22:39:39 +0100
committer: Ingo Molnar <mingo@kernel.org> 2017-12-06 22:39:39 +0100
commit: d6eabce2577a695197e9433302fd6a9f0e1a7666 (patch)
tree: e5c5bcc7803879d5d911cc8faeb74a946c242395 /kernel
parent: Merge branch 'perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/a... (diff)
parent: x86: don't hash faulting address in oops printout (diff)
download: linux-d6eabce2577a695197e9433302fd6a9f0e1a7666.tar.xz
linux-d6eabce2577a695197e9433302fd6a9f0e1a7666.zip
102 files changed, 6375 insertions, 2205 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index be1c28fd4d57..227db99b0f19 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -85,13 +85,13 @@ static int	audit_initialized;
 #define AUDIT_OFF	0
 #define AUDIT_ON	1
 #define AUDIT_LOCKED	2
-u32		audit_enabled;
-u32		audit_ever_enabled;
+u32		audit_enabled = AUDIT_OFF;
+bool		audit_ever_enabled = !!AUDIT_OFF;
 
 EXPORT_SYMBOL_GPL(audit_enabled);
 
 /* Default state when kernel boots without any parameters. */
-static u32	audit_default;
+static u32	audit_default = AUDIT_OFF;
 
 /* If auditing cannot proceed, audit_failure selects what happens. */
 static u32	audit_failure = AUDIT_FAIL_PRINTK;
@@ -1197,25 +1197,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			pid_t auditd_pid;
 			struct pid *req_pid = task_tgid(current);
 
-			/* sanity check - PID values must match */
-			if (new_pid != pid_vnr(req_pid))
+			/* Sanity check - PID values must match. Setting
+			 * pid to 0 is how auditd ends auditing. */
+			if (new_pid && (new_pid != pid_vnr(req_pid)))
 				return -EINVAL;
 
 			/* test the auditd connection */
 			audit_replace(req_pid);
 
 			auditd_pid = auditd_pid_vnr();
-			/* only the current auditd can unregister itself */
-			if ((!new_pid) && (new_pid != auditd_pid)) {
-				audit_log_config_change("audit_pid", new_pid,
-							auditd_pid, 0);
-				return -EACCES;
-			}
-			/* replacing a healthy auditd is not allowed */
-			if (auditd_pid && new_pid) {
-				audit_log_config_change("audit_pid", new_pid,
-							auditd_pid, 0);
-				return -EEXIST;
+			if (auditd_pid) {
+				/* replacing a healthy auditd is not allowed */
+				if (new_pid) {
+					audit_log_config_change("audit_pid",
+							new_pid, auditd_pid, 0);
+					return -EEXIST;
+				}
+				/* only current auditd can unregister itself */
+				if (pid_vnr(req_pid) != auditd_pid) {
+					audit_log_config_change("audit_pid",
+							new_pid, auditd_pid, 0);
+					return -EACCES;
+				}
 			}
 
 			if (new_pid) {
@@ -1549,8 +1552,6 @@ static int __init audit_init(void)
 	register_pernet_subsys(&audit_net_ops);
 
 	audit_initialized = AUDIT_INITIALIZED;
-	audit_enabled = audit_default;
-	audit_ever_enabled |= !!audit_default;
 
 	kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
 	if (IS_ERR(kauditd_task)) {
@@ -1564,14 +1565,21 @@ static int __init audit_init(void)
 
 	return 0;
 }
-__initcall(audit_init);
+postcore_initcall(audit_init);
 
 /* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
 static int __init audit_enable(char *str)
 {
-	audit_default = !!simple_strtol(str, NULL, 0);
-	if (!audit_default)
+	long val;
+
+	if (kstrtol(str, 0, &val))
+		panic("audit: invalid 'audit' parameter value (%s)\n", str);
+	audit_default = (val ? AUDIT_ON : AUDIT_OFF);
+
+	if (audit_default == AUDIT_OFF)
 		audit_initialized = AUDIT_DISABLED;
+	if (audit_set_enabled(audit_default))
+		panic("audit: error setting audit state (%d)\n", audit_default);
 
 	pr_info("%s\n", audit_default ?
 		"enabled (after initialization)" : "disabled (until reboot)");
@@ -2337,32 +2345,6 @@ void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
 	}
 }
 
-#ifdef CONFIG_SECURITY
-/**
- * audit_log_secctx - Converts and logs SELinux context
- * @ab: audit_buffer
- * @secid: security number
- *
- * This is a helper function that calls security_secid_to_secctx to convert
- * secid to secctx and then adds the (converted) SELinux context to the audit
- * log by calling audit_log_format, thus also preventing leak of internal secid
- * to userspace. If secid cannot be converted audit_panic is called.
- */
-void audit_log_secctx(struct audit_buffer *ab, u32 secid)
-{
-	u32 len;
-	char *secctx;
-
-	if (security_secid_to_secctx(secid, &secctx, &len)) {
-		audit_panic("Cannot convert secid to context");
-	} else {
-		audit_log_format(ab, " obj=%s", secctx);
-		security_release_secctx(secctx, len);
-	}
-}
-EXPORT_SYMBOL(audit_log_secctx);
-#endif
-
 EXPORT_SYMBOL(audit_log_start);
 EXPORT_SYMBOL(audit_log_end);
 EXPORT_SYMBOL(audit_log_format);
diff --git a/kernel/audit.h b/kernel/audit.h
index 9b110ae17ee3..af5bc59487ed 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -208,7 +208,7 @@ struct audit_context {
 	struct audit_proctitle proctitle;
 };
 
-extern u32 audit_ever_enabled;
+extern bool audit_ever_enabled;
 
 extern void audit_copy_inode(struct audit_names *name,
 			     const struct dentry *dentry,
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index d4b050d9a66e..fd353120e0d9 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -1008,7 +1008,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify
 	 * We are guaranteed to have at least one reference to the mark from
 	 * either the inode or the caller of fsnotify_destroy_mark().
 	 */
-	BUG_ON(atomic_read(&entry->refcnt) < 1);
+	BUG_ON(refcount_read(&entry->refcnt) < 1);
 }
 
 static const struct fsnotify_ops audit_tree_ops = {
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0b0aa5854dac..4a1758adb222 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -56,7 +56,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[3]),
 	LIST_HEAD_INIT(audit_filter_list[4]),
 	LIST_HEAD_INIT(audit_filter_list[5]),
-#if AUDIT_NR_FILTERS != 6
+	LIST_HEAD_INIT(audit_filter_list[6]),
+#if AUDIT_NR_FILTERS != 7
 #error Fix audit_filter_list initialiser
 #endif
 };
@@ -67,6 +68,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_rules_list[3]),
 	LIST_HEAD_INIT(audit_rules_list[4]),
 	LIST_HEAD_INIT(audit_rules_list[5]),
+	LIST_HEAD_INIT(audit_rules_list[6]),
 };
 
 DEFINE_MUTEX(audit_filter_mutex);
@@ -263,6 +265,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
 #endif
 	case AUDIT_FILTER_USER:
 	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		;
 	}
 	if (unlikely(rule->action == AUDIT_POSSIBLE)) {
@@ -338,6 +341,21 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 		    entry->rule.listnr != AUDIT_FILTER_USER)
 			return -EINVAL;
 		break;
+	case AUDIT_FSTYPE:
+		if (entry->rule.listnr != AUDIT_FILTER_FS)
+			return -EINVAL;
+		break;
+	}
+
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_FS:
+		switch(f->type) {
+		case AUDIT_FSTYPE:
+		case AUDIT_FILTERKEY:
+			break;
+		default:
+			return -EINVAL;
+		}
 	}
 
 	switch(f->type) {
@@ -391,6 +409,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
 			return -EINVAL;
 	/* FALL THROUGH */
 	case AUDIT_ARCH:
+	case AUDIT_FSTYPE:
 		if (f->op != Audit_not_equal && f->op != Audit_equal)
 			return -EINVAL;
 		break;
@@ -910,10 +929,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
@@ -989,10 +1011,13 @@ int audit_del_rule(struct audit_entry *entry)
 #ifdef CONFIG_AUDITSYSCALL
 	int dont_count = 0;
 
-	/* If either of these, don't count towards total */
-	if (entry->rule.listnr == AUDIT_FILTER_USER ||
-		entry->rule.listnr == AUDIT_FILTER_TYPE)
+	/* If any of these, don't count towards total */
+	switch(entry->rule.listnr) {
+	case AUDIT_FILTER_USER:
+	case AUDIT_FILTER_TYPE:
+	case AUDIT_FILTER_FS:
 		dont_count = 1;
+	}
 #endif
 
 	mutex_lock(&audit_filter_mutex);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ecc23e25c9eb..e80459f7e132 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1869,10 +1869,33 @@ void __audit_inode_child(struct inode *parent,
 	struct inode *inode = d_backing_inode(dentry);
 	const char *dname = dentry->d_name.name;
 	struct audit_names *n, *found_parent = NULL, *found_child = NULL;
+	struct audit_entry *e;
+	struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
+	int i;
 
 	if (!context->in_syscall)
 		return;
 
+	rcu_read_lock();
+	if (!list_empty(list)) {
+		list_for_each_entry_rcu(e, list, list) {
+			for (i = 0; i < e->rule.field_count; i++) {
+				struct audit_field *f = &e->rule.fields[i];
+
+				if (f->type == AUDIT_FSTYPE) {
+					if (audit_comparator(parent->i_sb->s_magic,
+					    f->op, f->val)) {
+						if (e->rule.action == AUDIT_NEVER) {
+							rcu_read_unlock();
+							return;
+						}
+					}
+				}
+			}
+		}
+	}
+	rcu_read_unlock();
+
 	if (inode)
 		handle_one(inode);
 
@@ -2390,6 +2413,12 @@ void __audit_log_kern_module(char *name)
 	context->type = AUDIT_KERN_MODULE;
 }
 
+void __audit_fanotify(unsigned int response)
+{
+	audit_log(current->audit_context, GFP_KERNEL,
+		AUDIT_FANOTIFY,	"resp=%u", response);
+}
+
 static void audit_log_task(struct audit_buffer *ab)
 {
 	kuid_t auid, uid;
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index af3ab6164ff5..e691da0b3bab 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -3,8 +3,11 @@ obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
+obj-$(CONFIG_BPF_SYSCALL) += disasm.o
 ifeq ($(CONFIG_NET),y)
 obj-$(CONFIG_BPF_SYSCALL) += devmap.o
+obj-$(CONFIG_BPF_SYSCALL) += cpumap.o
+obj-$(CONFIG_BPF_SYSCALL) += offload.o
 ifeq ($(CONFIG_STREAM_PARSER),y)
 obj-$(CONFIG_BPF_SYSCALL) += sockmap.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index c4b9ab01bba5..7c25426d3cf5 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -19,6 +19,9 @@
 
 #include "map_in_map.h"
 
+#define ARRAY_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
 	int i;
@@ -56,7 +59,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+	    attr->value_size == 0 ||
+	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
 	    (percpu && numa_node != NUMA_NO_NODE))
 		return ERR_PTR(-EINVAL);
 
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 546113430049..b789ab78d28f 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -27,129 +27,405 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 {
 	unsigned int type;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
-		struct bpf_prog *prog = cgrp->bpf.prog[type];
-
-		if (prog) {
-			bpf_prog_put(prog);
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
+		struct list_head *progs = &cgrp->bpf.progs[type];
+		struct bpf_prog_list *pl, *tmp;
+
+		list_for_each_entry_safe(pl, tmp, progs, node) {
+			list_del(&pl->node);
+			bpf_prog_put(pl->prog);
+			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
+		bpf_prog_array_free(cgrp->bpf.effective[type]);
+	}
+}
+
+/* count number of elements in the list.
+ * it's slow but the list cannot be long
+ */
+static u32 prog_list_length(struct list_head *head)
+{
+	struct bpf_prog_list *pl;
+	u32 cnt = 0;
+
+	list_for_each_entry(pl, head, node) {
+		if (!pl->prog)
+			continue;
+		cnt++;
 	}
+	return cnt;
+}
+
+/* if parent has non-overridable prog attached,
+ * disallow attaching new programs to the descendent cgroup.
+ * if parent has overridable or multi-prog, allow attaching
+ */
+static bool hierarchy_allows_attach(struct cgroup *cgrp,
+				    enum bpf_attach_type type,
+				    u32 new_flags)
+{
+	struct cgroup *p;
+
+	p = cgroup_parent(cgrp);
+	if (!p)
+		return true;
+	do {
+		u32 flags = p->bpf.flags[type];
+		u32 cnt;
+
+		if (flags & BPF_F_ALLOW_MULTI)
+			return true;
+		cnt = prog_list_length(&p->bpf.progs[type]);
+		WARN_ON_ONCE(cnt > 1);
+		if (cnt == 1)
+			return !!(flags & BPF_F_ALLOW_OVERRIDE);
+		p = cgroup_parent(p);
+	} while (p);
+	return true;
+}
+
+/* compute a chain of effective programs for a given cgroup:
+ * start from the list of programs in this cgroup and add
+ * all parent programs.
+ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding
+ * to programs in this cgroup
+ */
+static int compute_effective_progs(struct cgroup *cgrp,
+				   enum bpf_attach_type type,
+				   struct bpf_prog_array __rcu **array)
+{
+	struct bpf_prog_array __rcu *progs;
+	struct bpf_prog_list *pl;
+	struct cgroup *p = cgrp;
+	int cnt = 0;
+
+	/* count number of effective programs by walking parents */
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			cnt += prog_list_length(&p->bpf.progs[type]);
+		p = cgroup_parent(p);
+	} while (p);
+
+	progs = bpf_prog_array_alloc(cnt, GFP_KERNEL);
+	if (!progs)
+		return -ENOMEM;
+
+	/* populate the array with effective progs */
+	cnt = 0;
+	p = cgrp;
+	do {
+		if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI))
+			list_for_each_entry(pl,
+					    &p->bpf.progs[type], node) {
+				if (!pl->prog)
+					continue;
+				rcu_dereference_protected(progs, 1)->
+					progs[cnt++] = pl->prog;
+			}
+		p = cgroup_parent(p);
+	} while (p);
+
+	*array = progs;
+	return 0;
+}
+
+static void activate_effective_progs(struct cgroup *cgrp,
+				     enum bpf_attach_type type,
+				     struct bpf_prog_array __rcu *array)
+{
+	struct bpf_prog_array __rcu *old_array;
+
+	old_array = xchg(&cgrp->bpf.effective[type], array);
+	/* free prog array after grace period, since __cgroup_bpf_run_*()
+	 * might be still walking the array
+	 */
+	bpf_prog_array_free(old_array);
 }
 
 /**
  * cgroup_bpf_inherit() - inherit effective programs from parent
  * @cgrp: the cgroup to modify
- * @parent: the parent to inherit from
  */
-void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+int cgroup_bpf_inherit(struct cgroup *cgrp)
 {
-	unsigned int type;
+/* has to use marco instead of const int, since compiler thinks
+ * that array below is variable length
+ */
+#define	NR ARRAY_SIZE(cgrp->bpf.effective)
+	struct bpf_prog_array __rcu *arrays[NR] = {};
+	int i;
 
-	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
-		struct bpf_prog *e;
+	for (i = 0; i < NR; i++)
+		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
 
-		e = rcu_dereference_protected(parent->bpf.effective[type],
-					      lockdep_is_held(&cgroup_mutex));
-		rcu_assign_pointer(cgrp->bpf.effective[type], e);
-		cgrp->bpf.disallow_override[type] = parent->bpf.disallow_override[type];
-	}
+	for (i = 0; i < NR; i++)
+		if (compute_effective_progs(cgrp, i, &arrays[i]))
+			goto cleanup;
+
+	for (i = 0; i < NR; i++)
+		activate_effective_progs(cgrp, i, arrays[i]);
+
+	return 0;
+cleanup:
+	for (i = 0; i < NR; i++)
+		bpf_prog_array_free(arrays[i]);
+	return -ENOMEM;
 }
 
+#define BPF_CGROUP_MAX_PROGS 64
+
 /**
- * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ * __cgroup_bpf_attach() - Attach the program to a cgroup, and
  *                         propagate the change to descendants
  * @cgrp: The cgroup which descendants to traverse
- * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
- * @prog: A new program to pin
- * @type: Type of pinning operation (ingress/egress)
- *
- * Each cgroup has a set of two pointers for bpf programs; one for eBPF
- * programs it owns, and which is effective for execution.
- *
- * If @prog is not %NULL, this function attaches a new program to the cgroup
- * and releases the one that is currently attached, if any. @prog is then made
- * the effective program of type @type in that cgroup.
- *
- * If @prog is %NULL, the currently attached program of type @type is released,
- * and the effective program of the parent cgroup (if any) is inherited to
- * @cgrp.
- *
- * Then, the descendants of @cgrp are walked and the effective program for
- * each of them is set to the effective program of @cgrp unless the
- * descendant has its own program attached, in which case the subbranch is
- * skipped. This ensures that delegated subcgroups with own programs are left
- * untouched.
+ * @prog: A program to attach
+ * @type: Type of attach operation
  *
  * Must be called with cgroup_mutex held.
  */
-int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
-			struct bpf_prog *prog, enum bpf_attach_type type,
-			bool new_overridable)
+int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 flags)
 {
-	struct bpf_prog *old_prog, *effective = NULL;
-	struct cgroup_subsys_state *pos;
-	bool overridable = true;
-
-	if (parent) {
-		overridable = !parent->bpf.disallow_override[type];
-		effective = rcu_dereference_protected(parent->bpf.effective[type],
-						      lockdep_is_held(&cgroup_mutex));
-	}
-
-	if (prog && effective && !overridable)
-		/* if parent has non-overridable prog attached, disallow
-		 * attaching new programs to descendent cgroup
-		 */
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	bool pl_was_allocated;
+	int err;
+
+	if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI))
+		/* invalid combination */
+		return -EINVAL;
+
+	if (!hierarchy_allows_attach(cgrp, type, flags))
 		return -EPERM;
 
-	if (prog && effective && overridable != new_overridable)
-		/* if parent has overridable prog attached, only
-		 * allow overridable programs in descendent cgroup
+	if (!list_empty(progs) && cgrp->bpf.flags[type] != flags)
+		/* Disallow attaching non-overridable on top
+		 * of existing overridable in this cgroup.
+		 * Disallow attaching multi-prog if overridable or none
 		 */
 		return -EPERM;
 
-	old_prog = cgrp->bpf.prog[type];
-
-	if (prog) {
-		overridable = new_overridable;
-		effective = prog;
-		if (old_prog &&
-		    cgrp->bpf.disallow_override[type] == new_overridable)
-			/* disallow attaching non-overridable on top
-			 * of existing overridable in this cgroup
-			 * and vice versa
-			 */
-			return -EPERM;
+	if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+		return -E2BIG;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		list_for_each_entry(pl, progs, node)
+			if (pl->prog == prog)
+				/* disallow attaching the same prog twice */
+				return -EINVAL;
+
+		pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+		if (!pl)
+			return -ENOMEM;
+		pl_was_allocated = true;
+		pl->prog = prog;
+		list_add_tail(&pl->node, progs);
+	} else {
+		if (list_empty(progs)) {
+			pl = kmalloc(sizeof(*pl), GFP_KERNEL);
+			if (!pl)
+				return -ENOMEM;
+			pl_was_allocated = true;
+			list_add_tail(&pl->node, progs);
+		} else {
+			pl = list_first_entry(progs, typeof(*pl), node);
+			old_prog = pl->prog;
+			pl_was_allocated = false;
+		}
+		pl->prog = prog;
 	}
 
-	if (!prog && !old_prog)
-		/* report error when trying to detach and nothing is attached */
-		return -ENOENT;
+	cgrp->bpf.flags[type] = flags;
 
-	cgrp->bpf.prog[type] = prog;
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
-	css_for_each_descendant_pre(pos, &cgrp->self) {
-		struct cgroup *desc = container_of(pos, struct cgroup, self);
-
-		/* skip the subtree if the descendant has its own program */
-		if (desc->bpf.prog[type] && desc != cgrp) {
-			pos = css_rightmost_descendant(pos);
-		} else {
-			rcu_assign_pointer(desc->bpf.effective[type],
-					   effective);
-			desc->bpf.disallow_override[type] = !overridable;
-		}
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
 	}
 
-	if (prog)
-		static_branch_inc(&cgroup_bpf_enabled_key);
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
 
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	static_branch_inc(&cgroup_bpf_enabled_key);
 	if (old_prog) {
 		bpf_prog_put(old_prog);
 		static_branch_dec(&cgroup_bpf_enabled_key);
 	}
 	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and cleanup the prog list */
+	pl->prog = old_prog;
+	if (pl_was_allocated) {
+		list_del(&pl->node);
+		kfree(pl);
+	}
+	return err;
+}
+
+/**
+ * __cgroup_bpf_detach() - Detach the program from a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @prog: A program to detach or NULL
+ * @type: Type of detach operation
+ *
+ * Must be called with cgroup_mutex held.
+ */
+int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+			enum bpf_attach_type type, u32 unused_flags)
+{
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog *old_prog = NULL;
+	struct cgroup_subsys_state *css;
+	struct bpf_prog_list *pl;
+	int err;
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		if (!prog)
+			/* to detach MULTI prog the user has to specify valid FD
+			 * of the program to be detached
+			 */
+			return -EINVAL;
+	} else {
+		if (list_empty(progs))
+			/* report error when trying to detach and nothing is attached */
+			return -ENOENT;
+	}
+
+	if (flags & BPF_F_ALLOW_MULTI) {
+		/* find the prog and detach it */
+		list_for_each_entry(pl, progs, node) {
+			if (pl->prog != prog)
+				continue;
+			old_prog = prog;
+			/* mark it deleted, so it's ignored while
+			 * recomputing effective
+			 */
+			pl->prog = NULL;
+			break;
+		}
+		if (!old_prog)
+			return -ENOENT;
+	} else {
+		/* to maintain backward compatibility NONE and OVERRIDE cgroups
+		 * allow detaching with invalid FD (prog==NULL)
+		 */
+		pl = list_first_entry(progs, typeof(*pl), node);
+		old_prog = pl->prog;
+		pl->prog = NULL;
+	}
+
+	/* allocate and recompute effective prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
+		if (err)
+			goto cleanup;
+	}
+
+	/* all allocations were successful. Activate all prog arrays */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		activate_effective_progs(desc, type, desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* now can actually delete it from this cgroup list */
+	list_del(&pl->node);
+	kfree(pl);
+	if (list_empty(progs))
+		/* last program was detached, reset flags to zero */
+		cgrp->bpf.flags[type] = 0;
+
+	bpf_prog_put(old_prog);
+	static_branch_dec(&cgroup_bpf_enabled_key);
+	return 0;
+
+cleanup:
+	/* oom while computing effective. Free all computed effective arrays
+	 * since they were not activated
+	 */
+	css_for_each_descendant_pre(css, &cgrp->self) {
+		struct cgroup *desc = container_of(css, struct cgroup, self);
+
+		bpf_prog_array_free(desc->bpf.inactive);
+		desc->bpf.inactive = NULL;
+	}
+
+	/* and restore back old_prog */
+	pl->prog = old_prog;
+	return err;
+}
+
+/* Must be called with cgroup_mutex held to avoid races. */
+int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		       union bpf_attr __user *uattr)
+{
+	__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
+	enum bpf_attach_type type = attr->query.attach_type;
+	struct list_head *progs = &cgrp->bpf.progs[type];
+	u32 flags = cgrp->bpf.flags[type];
+	int cnt, ret = 0, i;
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
+		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+	else
+		cnt = prog_list_length(progs);
+
+	if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
+		return -EFAULT;
+	if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+		return -EFAULT;
+	if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+		/* return early if user requested only program count + flags */
+		return 0;
+	if (attr->query.prog_cnt < cnt) {
+		cnt = attr->query.prog_cnt;
+		ret = -ENOSPC;
+	}
+
+	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
+						   prog_ids, cnt);
+	} else {
+		struct bpf_prog_list *pl;
+		u32 id;
+
+		i = 0;
+		list_for_each_entry(pl, progs, node) {
+			id = pl->prog->aux->id;
+			if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+				return -EFAULT;
+			if (++i == cnt)
+				break;
+		}
+	}
+	return ret;
 }
 
 /**
@@ -171,36 +447,26 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
 				enum bpf_attach_type type)
 {
-	struct bpf_prog *prog;
+	unsigned int offset = skb->data - skb_network_header(skb);
+	struct sock *save_sk;
 	struct cgroup *cgrp;
-	int ret = 0;
+	int ret;
 
 	if (!sk || !sk_fullsock(sk))
 		return 0;
 
-	if (sk->sk_family != AF_INET &&
-	    sk->sk_family != AF_INET6)
+	if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)
 		return 0;
 
 	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog) {
-		unsigned int offset = skb->data - skb_network_header(skb);
-		struct sock *save_sk = skb->sk;
-
-		skb->sk = sk;
-		__skb_push(skb, offset);
-		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
-		__skb_pull(skb, offset);
-		skb->sk = save_sk;
-	}
-
-	rcu_read_unlock();
-
-	return ret;
+	save_sk = skb->sk;
+	skb->sk = sk;
+	__skb_push(skb, offset);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+				 bpf_prog_run_save_cb);
+	__skb_pull(skb, offset);
+	skb->sk = save_sk;
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
 
@@ -221,19 +487,10 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
 			       enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
+	int ret;
 
-
-	rcu_read_lock();
-
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;
-
-	rcu_read_unlock();
-
-	return ret;
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
 
@@ -258,18 +515,77 @@ int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
 				     enum bpf_attach_type type)
 {
 	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
-	struct bpf_prog *prog;
-	int ret = 0;
+	int ret;
 
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops,
+				 BPF_PROG_RUN);
+	return ret == 1 ? 0 : -EPERM;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
+				      short access, enum bpf_attach_type type)
+{
+	struct cgroup *cgrp;
+	struct bpf_cgroup_dev_ctx ctx = {
+		.access_type = (access << 16) | dev_type,
+		.major = major,
+		.minor = minor,
+	};
+	int allow = 1;
 
 	rcu_read_lock();
+	cgrp = task_dfl_cgroup(current);
+	allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx,
+				   BPF_PROG_RUN);
+	rcu_read_unlock();
 
-	prog = rcu_dereference(cgrp->bpf.effective[type]);
-	if (prog)
-		ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+	return !allow;
+}
+EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission);
 
-	rcu_read_unlock();
+static const struct bpf_func_proto *
+cgroup_dev_func_proto(enum bpf_func_id func_id)
+{
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	case BPF_FUNC_get_current_uid_gid:
+		return &bpf_get_current_uid_gid_proto;
+	case BPF_FUNC_trace_printk:
+		if (capable(CAP_SYS_ADMIN))
+			return bpf_get_trace_printk_proto();
+	default:
+		return NULL;
+	}
+}
 
-	return ret;
+static bool cgroup_dev_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       struct bpf_insn_access_aux *info)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx))
+		return false;
+	/* The verifier guarantees that size > 0. */
+	if (off % size != 0)
+		return false;
+	if (size != sizeof(__u32))
+		return false;
+
+	return true;
 }
-EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
+
+const struct bpf_prog_ops cg_dev_prog_ops = {
+};
+
+const struct bpf_verifier_ops cg_dev_verifier_ops = {
+	.get_func_proto		= cgroup_dev_func_proto,
+	.is_valid_access	= cgroup_dev_is_valid_access,
+};
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7b62df86be1d..86b50aa26ee8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	if (fp == NULL)
 		return NULL;
 
-	kmemcheck_annotate_bitfield(fp, meta);
-
 	aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
 	if (aux == NULL) {
 		vfree(fp);
@@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
 	if (fp == NULL) {
 		__bpf_prog_uncharge(fp_old->aux->user, delta);
 	} else {
-		kmemcheck_annotate_bitfield(fp, meta);
-
 		memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
 		fp->pages = pages;
 		fp->aux->prog = fp;
@@ -309,12 +305,25 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog,
 
 static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym)
 {
+	const char *end = sym + KSYM_NAME_LEN;
+
 	BUILD_BUG_ON(sizeof("bpf_prog_") +
-		     sizeof(prog->tag) * 2 + 1 > KSYM_NAME_LEN);
+		     sizeof(prog->tag) * 2 +
+		     /* name has been null terminated.
+		      * We should need +1 for the '_' preceding
+		      * the name.  However, the null character
+		      * is double counted between the name and the
+		      * sizeof("bpf_prog_") above, so we omit
+		      * the +1 here.
+		      */
+		     sizeof(prog->aux->name) > KSYM_NAME_LEN);
 
 	sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_");
 	sym  = bin2hex(sym, prog->tag, sizeof(prog->tag));
-	*sym = 0;
+	if (prog->aux->name[0])
+		snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name);
+	else
+		*sym = 0;
 }
 
 static __always_inline unsigned long
@@ -662,8 +671,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other,
 
 	fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL);
 	if (fp != NULL) {
-		kmemcheck_annotate_bitfield(fp, meta);
-
 		/* aux->prog still points to the fp_other one, so
 		 * when promoting the clone to the real program,
 		 * this still needs to be adapted.
@@ -1367,7 +1374,13 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 	 * valid program, which in this case would simply not
 	 * be JITed, but falls back to the interpreter.
 	 */
-	fp = bpf_int_jit_compile(fp);
+	if (!bpf_prog_is_dev_bound(fp->aux)) {
+		fp = bpf_int_jit_compile(fp);
+	} else {
+		*err = bpf_prog_offload_compile(fp);
+		if (*err)
+			return fp;
+	}
 	bpf_prog_lock_ro(fp);
 
 	/* The tail call compatibility check can only be done at
@@ -1381,11 +1394,164 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
 
+static unsigned int __bpf_prog_ret1(const void *ctx,
+				    const struct bpf_insn *insn)
+{
+	return 1;
+}
+
+static struct bpf_prog_dummy {
+	struct bpf_prog prog;
+} dummy_bpf_prog = {
+	.prog = {
+		.bpf_func = __bpf_prog_ret1,
+	},
+};
+
+/* to avoid allocating empty bpf_prog_array for cgroups that
+ * don't have bpf program attached use one global 'empty_prog_array'
+ * It will not be modified the caller of bpf_prog_array_alloc()
+ * (since caller requested prog_cnt == 0)
+ * that pointer should be 'freed' by bpf_prog_array_free()
+ */
+static struct {
+	struct bpf_prog_array hdr;
+	struct bpf_prog *null_prog;
+} empty_prog_array = {
+	.null_prog = NULL,
+};
+
+struct bpf_prog_array __rcu *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags)
+{
+	if (prog_cnt)
+		return kzalloc(sizeof(struct bpf_prog_array) +
+			       sizeof(struct bpf_prog *) * (prog_cnt + 1),
+			       flags);
+
+	return &empty_prog_array.hdr;
+}
+
+void bpf_prog_array_free(struct bpf_prog_array __rcu *progs)
+{
+	if (!progs ||
+	    progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr)
+		return;
+	kfree_rcu(progs, rcu);
+}
+
+int bpf_prog_array_length(struct bpf_prog_array __rcu *progs)
+{
+	struct bpf_prog **prog;
+	u32 cnt = 0;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++)
+		if (*prog != &dummy_bpf_prog.prog)
+			cnt++;
+	rcu_read_unlock();
+	return cnt;
+}
+
+int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
+				__u32 __user *prog_ids, u32 cnt)
+{
+	struct bpf_prog **prog;
+	u32 i = 0, id;
+
+	rcu_read_lock();
+	prog = rcu_dereference(progs)->progs;
+	for (; *prog; prog++) {
+		id = (*prog)->aux->id;
+		if (copy_to_user(prog_ids + i, &id, sizeof(id))) {
+			rcu_read_unlock();
+			return -EFAULT;
+		}
+		if (++i == cnt) {
+			prog++;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	if (*prog)
+		return -ENOSPC;
+	return 0;
+}
+
+void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
+				struct bpf_prog *old_prog)
+{
+	struct bpf_prog **prog = progs->progs;
+
+	for (; *prog; prog++)
+		if (*prog == old_prog) {
+			WRITE_ONCE(*prog, &dummy_bpf_prog.prog);
+			break;
+		}
+}
+
+int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
+			struct bpf_prog *exclude_prog,
+			struct bpf_prog *include_prog,
+			struct bpf_prog_array **new_array)
+{
+	int new_prog_cnt, carry_prog_cnt = 0;
+	struct bpf_prog **existing_prog;
+	struct bpf_prog_array *array;
+	int new_prog_idx = 0;
+
+	/* Figure out how many existing progs we need to carry over to
+	 * the new array.
+	 */
+	if (old_array) {
+		existing_prog = old_array->progs;
+		for (; *existing_prog; existing_prog++) {
+			if (*existing_prog != exclude_prog &&
+			    *existing_prog != &dummy_bpf_prog.prog)
+				carry_prog_cnt++;
+			if (*existing_prog == include_prog)
+				return -EEXIST;
+		}
+	}
+
+	/* How many progs (not NULL) will be in the new array? */
+	new_prog_cnt = carry_prog_cnt;
+	if (include_prog)
+		new_prog_cnt += 1;
+
+	/* Do we have any prog (not NULL) in the new array? */
+	if (!new_prog_cnt) {
+		*new_array = NULL;
+		return 0;
+	}
+
+	/* +1 as the end of prog_array is marked with NULL */
+	array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL);
+	if (!array)
+		return -ENOMEM;
+
+	/* Fill in the new prog array */
+	if (carry_prog_cnt) {
+		existing_prog = old_array->progs;
+		for (; *existing_prog; existing_prog++)
+			if (*existing_prog != exclude_prog &&
+			    *existing_prog != &dummy_bpf_prog.prog)
+				array->progs[new_prog_idx++] = *existing_prog;
+	}
+	if (include_prog)
+		array->progs[new_prog_idx++] = include_prog;
+	array->progs[new_prog_idx] = NULL;
+	*new_array = array;
+	return 0;
+}
+
 static void bpf_prog_free_deferred(struct work_struct *work)
 {
 	struct bpf_prog_aux *aux;
 
 	aux = container_of(work, struct bpf_prog_aux, work);
+	if (bpf_prog_is_dev_bound(aux))
+		bpf_prog_offload_destroy(aux->prog);
 	bpf_jit_free(aux->prog);
 }
 
@@ -1498,5 +1664,8 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
 
+/* These are only used within the BPF_SYSCALL code */
+#ifdef CONFIG_BPF_SYSCALL
 EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
 EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
+#endif
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
new file mode 100644
index 000000000000..ce5b669003b2
--- /dev/null
+++ b/kernel/bpf/cpumap.c
@@ -0,0 +1,706 @@
+/* bpf/cpumap.c
+ *
+ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
+ * Released under terms in GPL version 2.  See COPYING.
+ */
+
+/* The 'cpumap' is primarily used as a backend map for XDP BPF helper
+ * call bpf_redirect_map() and XDP_REDIRECT action, like 'devmap'.
+ *
+ * Unlike devmap which redirects XDP frames out another NIC device,
+ * this map type redirects raw XDP frames to another CPU.  The remote
+ * CPU will do SKB-allocation and call the normal network stack.
+ *
+ * This is a scalability and isolation mechanism, that allow
+ * separating the early driver network XDP layer, from the rest of the
+ * netstack, and assigning dedicated CPUs for this stage.  This
+ * basically allows for 10G wirespeed pre-filtering via bpf.
+ */
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <linux/ptr_ring.h>
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/capability.h>
+#include <trace/events/xdp.h>
+
+#include <linux/netdevice.h>   /* netif_receive_skb_core */
+#include <linux/etherdevice.h> /* eth_type_trans */
+
+/* General idea: XDP packets getting XDP redirected to another CPU,
+ * will maximum be stored/queued for one driver ->poll() call.  It is
+ * guaranteed that setting flush bit and flush operation happen on
+ * same CPU.  Thus, cpu_map_flush operation can deduct via this_cpu_ptr()
+ * which queue in bpf_cpu_map_entry contains packets.
+ */
+
+#define CPU_MAP_BULK_SIZE 8  /* 8 == one cacheline on 64-bit archs */
+struct xdp_bulk_queue {
+	void *q[CPU_MAP_BULK_SIZE];
+	unsigned int count;
+};
+
+/* Struct for every remote "destination" CPU in map */
+struct bpf_cpu_map_entry {
+	u32 cpu;    /* kthread CPU and map index */
+	int map_id; /* Back reference to map */
+	u32 qsize;  /* Queue size placeholder for map lookup */
+
+	/* XDP can run multiple RX-ring queues, need __percpu enqueue store */
+	struct xdp_bulk_queue __percpu *bulkq;
+
+	/* Queue with potential multi-producers, and single-consumer kthread */
+	struct ptr_ring *queue;
+	struct task_struct *kthread;
+	struct work_struct kthread_stop_wq;
+
+	atomic_t refcnt; /* Control when this struct can be free'ed */
+	struct rcu_head rcu;
+};
+
+struct bpf_cpu_map {
+	struct bpf_map map;
+	/* Below members specific for map type */
+	struct bpf_cpu_map_entry **cpu_map;
+	unsigned long __percpu *flush_needed;
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq);
+
+static u64 cpu_map_bitmap_size(const union bpf_attr *attr)
+{
+	return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
+}
+
+static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_cpu_map *cmap;
+	int err = -ENOMEM;
+	u64 cost;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+		return ERR_PTR(-EINVAL);
+
+	cmap = kzalloc(sizeof(*cmap), GFP_USER);
+	if (!cmap)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	cmap->map.map_type = attr->map_type;
+	cmap->map.key_size = attr->key_size;
+	cmap->map.value_size = attr->value_size;
+	cmap->map.max_entries = attr->max_entries;
+	cmap->map.map_flags = attr->map_flags;
+	cmap->map.numa_node = bpf_map_attr_numa_node(attr);
+
+	/* Pre-limit array size based on NR_CPUS, not final CPU check */
+	if (cmap->map.max_entries > NR_CPUS) {
+		err = -E2BIG;
+		goto free_cmap;
+	}
+
+	/* make sure page count doesn't overflow */
+	cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *);
+	cost += cpu_map_bitmap_size(attr) * num_possible_cpus();
+	if (cost >= U32_MAX - PAGE_SIZE)
+		goto free_cmap;
+	cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	/* Notice returns -EPERM on if map size is larger than memlock limit */
+	ret = bpf_map_precharge_memlock(cmap->map.pages);
+	if (ret) {
+		err = ret;
+		goto free_cmap;
+	}
+
+	/* A per cpu bitfield with a bit per possible CPU in map  */
+	cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
+					    __alignof__(unsigned long));
+	if (!cmap->flush_needed)
+		goto free_cmap;
+
+	/* Alloc array for possible remote "destination" CPUs */
+	cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
+					   sizeof(struct bpf_cpu_map_entry *),
+					   cmap->map.numa_node);
+	if (!cmap->cpu_map)
+		goto free_percpu;
+
+	return &cmap->map;
+free_percpu:
+	free_percpu(cmap->flush_needed);
+free_cmap:
+	kfree(cmap);
+	return ERR_PTR(err);
+}
+
+void __cpu_map_queue_destructor(void *ptr)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	if (WARN_ON_ONCE(ptr))
+		page_frag_free(ptr);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
+static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	atomic_inc(&rcpu->refcnt);
+}
+
+/* called from workqueue, to workaround syscall using preempt_disable */
+static void cpu_map_kthread_stop(struct work_struct *work)
+{
+	struct bpf_cpu_map_entry *rcpu;
+
+	rcpu = container_of(work, struct bpf_cpu_map_entry, kthread_stop_wq);
+
+	/* Wait for flush in __cpu_map_entry_free(), via full RCU barrier,
+	 * as it waits until all in-flight call_rcu() callbacks complete.
+	 */
+	rcu_barrier();
+
+	/* kthread_stop will wake_up_process and wait for it to complete */
+	kthread_stop(rcpu->kthread);
+}
+
+/* For now, xdp_pkt is a cpumap internal data structure, with info
+ * carried between enqueue to dequeue. It is mapped into the top
+ * headroom of the packet, to avoid allocating separate mem.
+ */
+struct xdp_pkt {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	struct net_device *dev_rx;
+};
+
+/* Convert xdp_buff to xdp_pkt */
+static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
+{
+	struct xdp_pkt *xdp_pkt;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_pkt = xdp->data_hard_start;
+
+	xdp_pkt->data = xdp->data;
+	xdp_pkt->len  = xdp->data_end - xdp->data;
+	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
+	xdp_pkt->metasize = metasize;
+
+	return xdp_pkt;
+}
+
+struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
+				  struct xdp_pkt *xdp_pkt)
+{
+	unsigned int frame_size;
+	void *pkt_data_start;
+	struct sk_buff *skb;
+
+	/* build_skb need to place skb_shared_info after SKB end, and
+	 * also want to know the memory "truesize".  Thus, need to
+	 * know the memory frame size backing xdp_buff.
+	 *
+	 * XDP was designed to have PAGE_SIZE frames, but this
+	 * assumption is not longer true with ixgbe and i40e.  It
+	 * would be preferred to set frame_size to 2048 or 4096
+	 * depending on the driver.
+	 *   frame_size = 2048;
+	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *
+	 * Instead, with info avail, skb_shared_info in placed after
+	 * packet len.  This, unfortunately fakes the truesize.
+	 * Another disadvantage of this approach, the skb_shared_info
+	 * is not at a fixed memory location, with mixed length
+	 * packets, which is bad for cache-line hotness.
+	 */
+	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	skb = build_skb(pkt_data_start, frame_size);
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, xdp_pkt->headroom);
+	__skb_put(skb, xdp_pkt->len);
+	if (xdp_pkt->metasize)
+		skb_metadata_set(skb, xdp_pkt->metasize);
+
+	/* Essential SKB info: protocol and skb->dev */
+	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+
+	/* Optional SKB info, currently missing:
+	 * - HW checksum info		(skb->ip_summed)
+	 * - HW RX hash			(skb_set_hash)
+	 * - RX ring dev queue index	(skb_record_rx_queue)
+	 */
+
+	return skb;
+}
+
+static int cpu_map_kthread_run(void *data)
+{
+	struct bpf_cpu_map_entry *rcpu = data;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/* When kthread gives stop order, then rcpu have been disconnected
+	 * from map, thus no new packets can enter. Remaining in-flight
+	 * per CPU stored packets are flushed to this queue.  Wait honoring
+	 * kthread_stop signal until queue is empty.
+	 */
+	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
+		unsigned int processed = 0, drops = 0, sched = 0;
+		struct xdp_pkt *xdp_pkt;
+
+		/* Release CPU reschedule checks */
+		if (__ptr_ring_empty(rcpu->queue)) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			/* Recheck to avoid lost wake-up */
+			if (__ptr_ring_empty(rcpu->queue)) {
+				schedule();
+				sched = 1;
+			} else {
+				__set_current_state(TASK_RUNNING);
+			}
+		} else {
+			sched = cond_resched();
+		}
+
+		/* Process packets in rcpu->queue */
+		local_bh_disable();
+		/*
+		 * The bpf_cpu_map_entry is single consumer, with this
+		 * kthread CPU pinned. Lockless access to ptr_ring
+		 * consume side valid as no-resize allowed of queue.
+		 */
+		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+			struct sk_buff *skb;
+			int ret;
+
+			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			if (!skb) {
+				page_frag_free(xdp_pkt);
+				continue;
+			}
+
+			/* Inject into network stack */
+			ret = netif_receive_skb_core(skb);
+			if (ret == NET_RX_DROP)
+				drops++;
+
+			/* Limit BH-disable period */
+			if (++processed == 8)
+				break;
+		}
+		/* Feedback loop via tracepoint */
+		trace_xdp_cpumap_kthread(rcpu->map_id, processed, drops, sched);
+
+		local_bh_enable(); /* resched point, may call do_softirq() */
+	}
+	__set_current_state(TASK_RUNNING);
+
+	put_cpu_map_entry(rcpu);
+	return 0;
+}
+
+struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id)
+{
+	gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
+	struct bpf_cpu_map_entry *rcpu;
+	int numa, err;
+
+	/* Have map->numa_node, but choose node of redirect target CPU */
+	numa = cpu_to_node(cpu);
+
+	rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa);
+	if (!rcpu)
+		return NULL;
+
+	/* Alloc percpu bulkq */
+	rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq),
+					 sizeof(void *), gfp);
+	if (!rcpu->bulkq)
+		goto free_rcu;
+
+	/* Alloc queue */
+	rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa);
+	if (!rcpu->queue)
+		goto free_bulkq;
+
+	err = ptr_ring_init(rcpu->queue, qsize, gfp);
+	if (err)
+		goto free_queue;
+
+	rcpu->cpu    = cpu;
+	rcpu->map_id = map_id;
+	rcpu->qsize  = qsize;
+
+	/* Setup kthread */
+	rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa,
+					       "cpumap/%d/map:%d", cpu, map_id);
+	if (IS_ERR(rcpu->kthread))
+		goto free_ptr_ring;
+
+	get_cpu_map_entry(rcpu); /* 1-refcnt for being in cmap->cpu_map[] */
+	get_cpu_map_entry(rcpu); /* 1-refcnt for kthread */
+
+	/* Make sure kthread runs on a single CPU */
+	kthread_bind(rcpu->kthread, cpu);
+	wake_up_process(rcpu->kthread);
+
+	return rcpu;
+
+free_ptr_ring:
+	ptr_ring_cleanup(rcpu->queue, NULL);
+free_queue:
+	kfree(rcpu->queue);
+free_bulkq:
+	free_percpu(rcpu->bulkq);
+free_rcu:
+	kfree(rcpu);
+	return NULL;
+}
+
+void __cpu_map_entry_free(struct rcu_head *rcu)
+{
+	struct bpf_cpu_map_entry *rcpu;
+	int cpu;
+
+	/* This cpu_map_entry have been disconnected from map and one
+	 * RCU graze-period have elapsed.  Thus, XDP cannot queue any
+	 * new packets and cannot change/set flush_needed that can
+	 * find this entry.
+	 */
+	rcpu = container_of(rcu, struct bpf_cpu_map_entry, rcu);
+
+	/* Flush remaining packets in percpu bulkq */
+	for_each_online_cpu(cpu) {
+		struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu);
+
+		/* No concurrent bq_enqueue can run at this point */
+		bq_flush_to_queue(rcpu, bq);
+	}
+	free_percpu(rcpu->bulkq);
+	/* Cannot kthread_stop() here, last put free rcpu resources */
+	put_cpu_map_entry(rcpu);
+}
+
+/* After xchg pointer to bpf_cpu_map_entry, use the call_rcu() to
+ * ensure any driver rcu critical sections have completed, but this
+ * does not guarantee a flush has happened yet. Because driver side
+ * rcu_read_lock/unlock only protects the running XDP program.  The
+ * atomic xchg and NULL-ptr check in __cpu_map_flush() makes sure a
+ * pending flush op doesn't fail.
+ *
+ * The bpf_cpu_map_entry is still used by the kthread, and there can
+ * still be pending packets (in queue and percpu bulkq).  A refcnt
+ * makes sure to last user (kthread_stop vs. call_rcu) free memory
+ * resources.
+ *
+ * The rcu callback __cpu_map_entry_free flush remaining packets in
+ * percpu bulkq to queue.  Due to caller map_delete_elem() disable
+ * preemption, cannot call kthread_stop() to make sure queue is empty.
+ * Instead a work_queue is started for stopping kthread,
+ * cpu_map_kthread_stop, which waits for an RCU graze period before
+ * stopping kthread, emptying the queue.
+ */
+void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
+			     u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
+{
+	struct bpf_cpu_map_entry *old_rcpu;
+
+	old_rcpu = xchg(&cmap->cpu_map[key_cpu], rcpu);
+	if (old_rcpu) {
+		call_rcu(&old_rcpu->rcu, __cpu_map_entry_free);
+		INIT_WORK(&old_rcpu->kthread_stop_wq, cpu_map_kthread_stop);
+		schedule_work(&old_rcpu->kthread_stop_wq);
+	}
+}
+
+int cpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 key_cpu = *(u32 *)key;
+
+	if (key_cpu >= map->max_entries)
+		return -EINVAL;
+
+	/* notice caller map_delete_elem() use preempt_disable() */
+	__cpu_map_entry_replace(cmap, key_cpu, NULL);
+	return 0;
+}
+
+int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	/* Array index key correspond to CPU number */
+	u32 key_cpu = *(u32 *)key;
+	/* Value is the queue size */
+	u32 qsize = *(u32 *)value;
+
+	if (unlikely(map_flags > BPF_EXIST))
+		return -EINVAL;
+	if (unlikely(key_cpu >= cmap->map.max_entries))
+		return -E2BIG;
+	if (unlikely(map_flags == BPF_NOEXIST))
+		return -EEXIST;
+	if (unlikely(qsize > 16384)) /* sanity limit on qsize */
+		return -EOVERFLOW;
+
+	/* Make sure CPU is a valid possible cpu */
+	if (!cpu_possible(key_cpu))
+		return -ENODEV;
+
+	if (qsize == 0) {
+		rcpu = NULL; /* Same as deleting */
+	} else {
+		/* Updating qsize cause re-allocation of bpf_cpu_map_entry */
+		rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id);
+		if (!rcpu)
+			return -ENOMEM;
+	}
+	rcu_read_lock();
+	__cpu_map_entry_replace(cmap, key_cpu, rcpu);
+	rcu_read_unlock();
+	return 0;
+}
+
+void cpu_map_free(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	int cpu;
+	u32 i;
+
+	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the bpf programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete. The rcu critical section only guarantees
+	 * no further "XDP/bpf-side" reads against bpf_cpu_map->cpu_map.
+	 * It does __not__ ensure pending flush operations (if any) are
+	 * complete.
+	 */
+	synchronize_rcu();
+
+	/* To ensure all pending flush operations have completed wait for flush
+	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+	 * Because the above synchronize_rcu() ensures the map is disconnected
+	 * from the program we can assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, cmap->map.max_entries))
+			cond_resched();
+	}
+
+	/* For cpu_map the remote CPUs can still be using the entries
+	 * (struct bpf_cpu_map_entry).
+	 */
+	for (i = 0; i < cmap->map.max_entries; i++) {
+		struct bpf_cpu_map_entry *rcpu;
+
+		rcpu = READ_ONCE(cmap->cpu_map[i]);
+		if (!rcpu)
+			continue;
+
+		/* bq flush and cleanup happens after RCU graze-period */
+		__cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */
+	}
+	free_percpu(cmap->flush_needed);
+	bpf_map_area_free(cmap->cpu_map);
+	kfree(cmap);
+}
+
+struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	struct bpf_cpu_map_entry *rcpu;
+
+	if (key >= map->max_entries)
+		return NULL;
+
+	rcpu = READ_ONCE(cmap->cpu_map[key]);
+	return rcpu;
+}
+
+static void *cpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_cpu_map_entry *rcpu =
+		__cpu_map_lookup_elem(map, *(u32 *)key);
+
+	return rcpu ? &rcpu->qsize : NULL;
+}
+
+static int cpu_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	u32 index = key ? *(u32 *)key : U32_MAX;
+	u32 *next = next_key;
+
+	if (index >= cmap->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == cmap->map.max_entries - 1)
+		return -ENOENT;
+	*next = index + 1;
+	return 0;
+}
+
+const struct bpf_map_ops cpu_map_ops = {
+	.map_alloc		= cpu_map_alloc,
+	.map_free		= cpu_map_free,
+	.map_delete_elem	= cpu_map_delete_elem,
+	.map_update_elem	= cpu_map_update_elem,
+	.map_lookup_elem	= cpu_map_lookup_elem,
+	.map_get_next_key	= cpu_map_get_next_key,
+};
+
+static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
+			     struct xdp_bulk_queue *bq)
+{
+	unsigned int processed = 0, drops = 0;
+	const int to_cpu = rcpu->cpu;
+	struct ptr_ring *q;
+	int i;
+
+	if (unlikely(!bq->count))
+		return 0;
+
+	q = rcpu->queue;
+	spin_lock(&q->producer_lock);
+
+	for (i = 0; i < bq->count; i++) {
+		void *xdp_pkt = bq->q[i];
+		int err;
+
+		err = __ptr_ring_produce(q, xdp_pkt);
+		if (err) {
+			drops++;
+			page_frag_free(xdp_pkt); /* Free xdp_pkt */
+		}
+		processed++;
+	}
+	bq->count = 0;
+	spin_unlock(&q->producer_lock);
+
+	/* Feedback loop via tracepoints */
+	trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu);
+	return 0;
+}
+
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+{
+	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
+
+	if (unlikely(bq->count == CPU_MAP_BULK_SIZE))
+		bq_flush_to_queue(rcpu, bq);
+
+	/* Notice, xdp_buff/page MUST be queued here, long enough for
+	 * driver to code invoking us to finished, due to driver
+	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
+	 *
+	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * with another CPU on page-refcnt and remaining driver code).
+	 * Queue time is very short, as driver will invoke flush
+	 * operation, when completing napi->poll call.
+	 */
+	bq->q[bq->count++] = xdp_pkt;
+	return 0;
+}
+
+int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
+		    struct net_device *dev_rx)
+{
+	struct xdp_pkt *xdp_pkt;
+
+	xdp_pkt = convert_to_xdp_pkt(xdp);
+	if (unlikely(!xdp_pkt))
+		return -EOVERFLOW;
+
+	/* Info needed when constructing SKB on remote CPU */
+	xdp_pkt->dev_rx = dev_rx;
+
+	bq_enqueue(rcpu, xdp_pkt);
+	return 0;
+}
+
+void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+
+	__set_bit(bit, bitmap);
+}
+
+void __cpu_map_flush(struct bpf_map *map)
+{
+	struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
+	unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed);
+	u32 bit;
+
+	/* The napi->poll softirq makes sure __cpu_map_insert_ctx()
+	 * and __cpu_map_flush() happen on same CPU. Thus, the percpu
+	 * bitmap indicate which percpu bulkq have packets.
+	 */
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]);
+		struct xdp_bulk_queue *bq;
+
+		/* This is possible if entry is removed by user space
+		 * between xdp redirect and flush op.
+		 */
+		if (unlikely(!rcpu))
+			continue;
+
+		__clear_bit(bit, bitmap);
+
+		/* Flush all frames in bulkq to real queue */
+		bq = this_cpu_ptr(rcpu->bulkq);
+		bq_flush_to_queue(rcpu, bq);
+
+		/* If already running, costs spin_lock_irqsave + smb_mb */
+		wake_up_process(rcpu->kthread);
+	}
+}
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index e745d6a88224..ebdef54bf7df 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,9 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 
+#define DEV_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_dtab_netdev {
 	struct net_device *dev;
 	struct bpf_dtab *dtab;
@@ -83,7 +86,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	dtab = kzalloc(sizeof(*dtab), GFP_USER);
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c
new file mode 100644
index 000000000000..e682850c9715
--- /dev/null
+++ b/kernel/bpf/disasm.c
@@ -0,0 +1,214 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bpf.h>
+
+#include "disasm.h"
+
+#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
+static const char * const func_id_str[] = {
+	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
+};
+#undef __BPF_FUNC_STR_FN
+
+const char *func_id_name(int id)
+{
+	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
+
+	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
+		return func_id_str[id];
+	else
+		return "unknown";
+}
+
+const char *const bpf_class_string[8] = {
+	[BPF_LD]    = "ld",
+	[BPF_LDX]   = "ldx",
+	[BPF_ST]    = "st",
+	[BPF_STX]   = "stx",
+	[BPF_ALU]   = "alu",
+	[BPF_JMP]   = "jmp",
+	[BPF_RET]   = "BUG",
+	[BPF_ALU64] = "alu64",
+};
+
+const char *const bpf_alu_string[16] = {
+	[BPF_ADD >> 4]  = "+=",
+	[BPF_SUB >> 4]  = "-=",
+	[BPF_MUL >> 4]  = "*=",
+	[BPF_DIV >> 4]  = "/=",
+	[BPF_OR  >> 4]  = "|=",
+	[BPF_AND >> 4]  = "&=",
+	[BPF_LSH >> 4]  = "<<=",
+	[BPF_RSH >> 4]  = ">>=",
+	[BPF_NEG >> 4]  = "neg",
+	[BPF_MOD >> 4]  = "%=",
+	[BPF_XOR >> 4]  = "^=",
+	[BPF_MOV >> 4]  = "=",
+	[BPF_ARSH >> 4] = "s>>=",
+	[BPF_END >> 4]  = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+	[BPF_W >> 3]  = "u32",
+	[BPF_H >> 3]  = "u16",
+	[BPF_B >> 3]  = "u8",
+	[BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[16] = {
+	[BPF_JA >> 4]   = "jmp",
+	[BPF_JEQ >> 4]  = "==",
+	[BPF_JGT >> 4]  = ">",
+	[BPF_JLT >> 4]  = "<",
+	[BPF_JGE >> 4]  = ">=",
+	[BPF_JLE >> 4]  = "<=",
+	[BPF_JSET >> 4] = "&",
+	[BPF_JNE >> 4]  = "!=",
+	[BPF_JSGT >> 4] = "s>",
+	[BPF_JSLT >> 4] = "s<",
+	[BPF_JSGE >> 4] = "s>=",
+	[BPF_JSLE >> 4] = "s<=",
+	[BPF_CALL >> 4] = "call",
+	[BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_end_insn(bpf_insn_print_cb verbose,
+			       struct bpf_verifier_env *env,
+			       const struct bpf_insn *insn)
+{
+	verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg,
+		BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le",
+		insn->imm, insn->dst_reg);
+}
+
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn, bool allow_ptr_leaks)
+{
+	u8 class = BPF_CLASS(insn->code);
+
+	if (class == BPF_ALU || class == BPF_ALU64) {
+		if (BPF_OP(insn->code) == BPF_END) {
+			if (class == BPF_ALU64)
+				verbose(env, "BUG_alu64_%02x\n", insn->code);
+			else
+				print_bpf_end_insn(verbose, env, insn);
+		} else if (BPF_OP(insn->code) == BPF_NEG) {
+			verbose(env, "(%02x) r%d = %s-r%d\n",
+				insn->code, insn->dst_reg,
+				class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose(env, "(%02x) %sr%d %s %sr%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->src_reg);
+		} else {
+			verbose(env, "(%02x) %sr%d %s %s%d\n",
+				insn->code, class == BPF_ALU ? "(u32) " : "",
+				insn->dst_reg,
+				bpf_alu_string[BPF_OP(insn->code) >> 4],
+				class == BPF_ALU ? "(u32) " : "",
+				insn->imm);
+		}
+	} else if (class == BPF_STX) {
+		if (BPF_MODE(insn->code) == BPF_MEM)
+			verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg,
+				insn->off, insn->src_reg);
+		else if (BPF_MODE(insn->code) == BPF_XADD)
+			verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->dst_reg, insn->off,
+				insn->src_reg);
+		else
+			verbose(env, "BUG_%02x\n", insn->code);
+	} else if (class == BPF_ST) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose(env, "BUG_st_%02x\n", insn->code);
+			return;
+		}
+		verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n",
+			insn->code,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->dst_reg,
+			insn->off, insn->imm);
+	} else if (class == BPF_LDX) {
+		if (BPF_MODE(insn->code) != BPF_MEM) {
+			verbose(env, "BUG_ldx_%02x\n", insn->code);
+			return;
+		}
+		verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n",
+			insn->code, insn->dst_reg,
+			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+			insn->src_reg, insn->off);
+	} else if (class == BPF_LD) {
+		if (BPF_MODE(insn->code) == BPF_ABS) {
+			verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IND) {
+			verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+				insn->code,
+				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+				insn->src_reg, insn->imm);
+		} else if (BPF_MODE(insn->code) == BPF_IMM &&
+			   BPF_SIZE(insn->code) == BPF_DW) {
+			/* At this point, we already made sure that the second
+			 * part of the ldimm64 insn is accessible.
+			 */
+			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+			if (map_ptr && !allow_ptr_leaks)
+				imm = 0;
+
+			verbose(env, "(%02x) r%d = 0x%llx\n", insn->code,
+				insn->dst_reg, (unsigned long long)imm);
+		} else {
+			verbose(env, "BUG_ld_%02x\n", insn->code);
+			return;
+		}
+	} else if (class == BPF_JMP) {
+		u8 opcode = BPF_OP(insn->code);
+
+		if (opcode == BPF_CALL) {
+			verbose(env, "(%02x) call %s#%d\n", insn->code,
+				func_id_name(insn->imm), insn->imm);
+		} else if (insn->code == (BPF_JMP | BPF_JA)) {
+			verbose(env, "(%02x) goto pc%+d\n",
+				insn->code, insn->off);
+		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+			verbose(env, "(%02x) exit\n", insn->code);
+		} else if (BPF_SRC(insn->code) == BPF_X) {
+			verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->src_reg, insn->off);
+		} else {
+			verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n",
+				insn->code, insn->dst_reg,
+				bpf_jmp_string[BPF_OP(insn->code) >> 4],
+				insn->imm, insn->off);
+		}
+	} else {
+		verbose(env, "(%02x) %s\n",
+			insn->code, bpf_class_string[class]);
+	}
+}
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
new file mode 100644
index 000000000000..8de977e420b6
--- /dev/null
+++ b/kernel/bpf/disasm.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ * Copyright (c) 2016 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef __BPF_DISASM_H__
+#define __BPF_DISASM_H__
+
+#include <linux/bpf.h>
+#include <linux/kernel.h>
+#include <linux/stringify.h>
+
+extern const char *const bpf_alu_string[16];
+extern const char *const bpf_class_string[8];
+
+const char *func_id_name(int id);
+
+struct bpf_verifier_env;
+typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env,
+				  const char *, ...);
+void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env,
+		    const struct bpf_insn *insn, bool allow_ptr_leaks);
+
+#endif
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 6533f08d1238..e469e05c8e83 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -18,8 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
-#define HTAB_CREATE_FLAG_MASK \
-	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK						\
+	(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |	\
+	 BPF_F_RDONLY | BPF_F_WRONLY)
 
 struct bucket {
 	struct hlist_nulls_head head;
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..01aaef1a77c5 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -295,7 +295,7 @@ out:
 }
 
 static void *bpf_obj_do_get(const struct filename *pathname,
-			    enum bpf_type *type)
+			    enum bpf_type *type, int flags)
 {
 	struct inode *inode;
 	struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
 		return ERR_PTR(ret);
 
 	inode = d_backing_inode(path.dentry);
-	ret = inode_permission(inode, MAY_WRITE);
+	ret = inode_permission(inode, ACC_MODE(flags));
 	if (ret)
 		goto out;
 
@@ -326,18 +326,23 @@ out:
 	return ERR_PTR(ret);
 }
 
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
 {
 	enum bpf_type type = BPF_TYPE_UNSPEC;
 	struct filename *pname;
 	int ret = -ENOENT;
+	int f_flags;
 	void *raw;
 
+	f_flags = bpf_get_file_flag(flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	pname = getname(pathname);
 	if (IS_ERR(pname))
 		return PTR_ERR(pname);
 
-	raw = bpf_obj_do_get(pname, &type);
+	raw = bpf_obj_do_get(pname, &type, f_flags);
 	if (IS_ERR(raw)) {
 		ret = PTR_ERR(raw);
 		goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
 	if (type == BPF_TYPE_PROG)
 		ret = bpf_prog_new_fd(raw);
 	else if (type == BPF_TYPE_MAP)
-		ret = bpf_map_new_fd(raw);
+		ret = bpf_map_new_fd(raw, f_flags);
 	else
 		goto out;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 1b767844a76f..885e45479680 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -389,10 +389,99 @@ out:
 	return ret;
 }
 
-static int trie_delete_elem(struct bpf_map *map, void *key)
+/* Called from syscall or from eBPF program */
+static int trie_delete_elem(struct bpf_map *map, void *_key)
 {
-	/* TODO */
-	return -ENOSYS;
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct bpf_lpm_trie_key *key = _key;
+	struct lpm_trie_node __rcu **trim, **trim2;
+	struct lpm_trie_node *node, *parent;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Walk the tree looking for an exact key/length match and keeping
+	 * track of the path we traverse.  We will need to know the node
+	 * we wish to delete, and the slot that points to the node we want
+	 * to delete.  We may also need to know the nodes parent and the
+	 * slot that contains it.
+	 */
+	trim = &trie->root;
+	trim2 = trim;
+	parent = NULL;
+	while ((node = rcu_dereference_protected(
+		       *trim, lockdep_is_held(&trie->lock)))) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen)
+			break;
+
+		parent = node;
+		trim2 = trim;
+		next_bit = extract_bit(key->data, node->prefixlen);
+		trim = &node->child[next_bit];
+	}
+
+	if (!node || node->prefixlen != key->prefixlen ||
+	    (node->flags & LPM_TREE_NODE_FLAG_IM)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	trie->n_entries--;
+
+	/* If the node we are removing has two children, simply mark it
+	 * as intermediate and we are done.
+	 */
+	if (rcu_access_pointer(node->child[0]) &&
+	    rcu_access_pointer(node->child[1])) {
+		node->flags |= LPM_TREE_NODE_FLAG_IM;
+		goto out;
+	}
+
+	/* If the parent of the node we are about to delete is an intermediate
+	 * node, and the deleted node doesn't have any children, we can delete
+	 * the intermediate parent as well and promote its other child
+	 * up the tree.  Doing this maintains the invariant that all
+	 * intermediate nodes have exactly 2 children and that there are no
+	 * unnecessary intermediate nodes in the tree.
+	 */
+	if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) &&
+	    !node->child[0] && !node->child[1]) {
+		if (node == rcu_access_pointer(parent->child[0]))
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[1]));
+		else
+			rcu_assign_pointer(
+				*trim2, rcu_access_pointer(parent->child[0]));
+		kfree_rcu(parent, rcu);
+		kfree_rcu(node, rcu);
+		goto out;
+	}
+
+	/* The node we are removing has either zero or one child. If there
+	 * is a child, move it into the removed node's slot then delete
+	 * the node.  Otherwise just clear the slot and delete the node.
+	 */
+	if (node->child[0])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0]));
+	else if (node->child[1])
+		rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1]));
+	else
+		RCU_INIT_POINTER(*trim, NULL);
+	kfree_rcu(node, rcu);
+
+out:
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
 }
 
 #define LPM_DATA_SIZE_MAX	256
@@ -406,7 +495,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
 #define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
-#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK	(BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |	\
+				 BPF_F_RDONLY | BPF_F_WRONLY)
 
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
new file mode 100644
index 000000000000..8455b89d1bbf
--- /dev/null
+++ b/kernel/bpf/offload.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2017 Netronome Systems, Inc.
+ *
+ * This software is licensed under the GNU General License Version 2,
+ * June 1991 as shown in the file COPYING in the top-level directory of this
+ * source tree.
+ *
+ * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS"
+ * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING,
+ * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE
+ * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME
+ * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+ */
+
+#include <linux/bpf.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/printk.h>
+#include <linux/rtnetlink.h>
+
+/* protected by RTNL */
+static LIST_HEAD(bpf_prog_offload_devs);
+
+int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
+{
+	struct net *net = current->nsproxy->net_ns;
+	struct bpf_dev_offload *offload;
+
+	if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
+	    attr->prog_type != BPF_PROG_TYPE_XDP)
+		return -EINVAL;
+
+	if (attr->prog_flags)
+		return -EINVAL;
+
+	offload = kzalloc(sizeof(*offload), GFP_USER);
+	if (!offload)
+		return -ENOMEM;
+
+	offload->prog = prog;
+	init_waitqueue_head(&offload->verifier_done);
+
+	rtnl_lock();
+	offload->netdev = __dev_get_by_index(net, attr->prog_ifindex);
+	if (!offload->netdev) {
+		rtnl_unlock();
+		kfree(offload);
+		return -EINVAL;
+	}
+
+	prog->aux->offload = offload;
+	list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
+	rtnl_unlock();
+
+	return 0;
+}
+
+static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
+			     struct netdev_bpf *data)
+{
+	struct net_device *netdev = prog->aux->offload->netdev;
+
+	ASSERT_RTNL();
+
+	if (!netdev)
+		return -ENODEV;
+	if (!netdev->netdev_ops->ndo_bpf)
+		return -EOPNOTSUPP;
+
+	data->command = cmd;
+
+	return netdev->netdev_ops->ndo_bpf(netdev, data);
+}
+
+int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env)
+{
+	struct netdev_bpf data = {};
+	int err;
+
+	data.verifier.prog = env->prog;
+
+	rtnl_lock();
+	err = __bpf_offload_ndo(env->prog, BPF_OFFLOAD_VERIFIER_PREP, &data);
+	if (err)
+		goto exit_unlock;
+
+	env->dev_ops = data.verifier.ops;
+
+	env->prog->aux->offload->dev_state = true;
+	env->prog->aux->offload->verifier_running = true;
+exit_unlock:
+	rtnl_unlock();
+	return err;
+}
+
+static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+
+	/* Caution - if netdev is destroyed before the program, this function
+	 * will be called twice.
+	 */
+
+	data.offload.prog = prog;
+
+	if (offload->verifier_running)
+		wait_event(offload->verifier_done, !offload->verifier_running);
+
+	if (offload->dev_state)
+		WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data));
+
+	offload->dev_state = false;
+	list_del_init(&offload->offloads);
+	offload->netdev = NULL;
+}
+
+void bpf_prog_offload_destroy(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	__bpf_prog_offload_destroy(prog);
+	rtnl_unlock();
+
+	kfree(offload);
+}
+
+static int bpf_prog_offload_translate(struct bpf_prog *prog)
+{
+	struct bpf_dev_offload *offload = prog->aux->offload;
+	struct netdev_bpf data = {};
+	int ret;
+
+	data.offload.prog = prog;
+
+	offload->verifier_running = false;
+	wake_up(&offload->verifier_done);
+
+	rtnl_lock();
+	ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data);
+	rtnl_unlock();
+
+	return ret;
+}
+
+static unsigned int bpf_prog_warn_on_exec(const void *ctx,
+					  const struct bpf_insn *insn)
+{
+	WARN(1, "attempt to execute device eBPF program on the host!");
+	return 0;
+}
+
+int bpf_prog_offload_compile(struct bpf_prog *prog)
+{
+	prog->bpf_func = bpf_prog_warn_on_exec;
+
+	return bpf_prog_offload_translate(prog);
+}
+
+const struct bpf_prog_ops bpf_offload_prog_ops = {
+};
+
+static int bpf_offload_notification(struct notifier_block *notifier,
+				    ulong event, void *ptr)
+{
+	struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+	struct bpf_dev_offload *offload, *tmp;
+
+	ASSERT_RTNL();
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+		/* ignore namespace changes */
+		if (netdev->reg_state != NETREG_UNREGISTERING)
+			break;
+
+		list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs,
+					 offloads) {
+			if (offload->netdev == netdev)
+				__bpf_prog_offload_destroy(offload->prog);
+		}
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block bpf_offload_notifier = {
+	.notifier_call = bpf_offload_notification,
+};
+
+static int __init bpf_offload_init(void)
+{
+	register_netdevice_notifier(&bpf_offload_notifier);
+	return 0;
+}
+
+subsys_initcall(bpf_offload_init);
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 {
 	struct pcpu_freelist_head *head;
 	struct pcpu_freelist_node *node;
+	unsigned long flags;
 	int orig_cpu, cpu;
 
+	local_irq_save(flags);
 	orig_cpu = cpu = raw_smp_processor_id();
 	while (1) {
 		head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
 		node = head->first;
 		if (node) {
 			head->first = node->next;
-			raw_spin_unlock(&head->lock);
+			raw_spin_unlock_irqrestore(&head->lock, flags);
 			return node;
 		}
 		raw_spin_unlock(&head->lock);
 		cpu = cpumask_next(cpu, cpu_possible_mask);
 		if (cpu >= nr_cpu_ids)
 			cpu = 0;
-		if (cpu == orig_cpu)
+		if (cpu == orig_cpu) {
+			local_irq_restore(flags);
 			return NULL;
+		}
 	}
 }
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index dbd7b322a86b..5ee2e41893d9 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -41,6 +41,9 @@
 #include <net/strparser.h>
 #include <net/tcp.h>
 
+#define SOCK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_stab {
 	struct bpf_map map;
 	struct sock **sock_map;
@@ -122,7 +125,7 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
 	 */
 	TCP_SKB_CB(skb)->bpf.map = NULL;
 	skb->sk = psock->sock;
-	bpf_compute_data_end_sk_skb(skb);
+	bpf_compute_data_pointers(skb);
 	preempt_disable();
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	preempt_enable();
@@ -385,7 +388,7 @@ static int smap_parse_func_strparser(struct strparser *strp,
 	 * any socket yet.
 	 */
 	skb->sk = psock->sock;
-	bpf_compute_data_end_sk_skb(skb);
+	bpf_compute_data_pointers(skb);
 	rc = (*prog->bpf_func)(skb, prog->insnsi);
 	skb->sk = NULL;
 	rcu_read_unlock();
@@ -508,7 +511,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	if (attr->value_size > KMALLOC_MAX_SIZE)
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 135be433e9a0..a15bc636cc98 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -11,6 +11,9 @@
 #include <linux/perf_event.h>
 #include "percpu_freelist.h"
 
+#define STACK_CREATE_FLAG_MASK \
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (attr->map_flags & ~BPF_F_NUMA_NODE)
+	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
 		return ERR_PTR(-EINVAL);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 25d074920a00..2c4cfeaa8d5e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -23,6 +23,9 @@
 #include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/idr.h>
+#include <linux/cred.h>
+#include <linux/timekeeping.h>
+#include <linux/ctype.h>
 
 #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
 			   (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
@@ -31,6 +34,8 @@
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
 
+#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -207,6 +212,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
 	struct bpf_map *map = container_of(work, struct bpf_map, work);
 
 	bpf_map_uncharge_memlock(map);
+	security_bpf_map_free(map);
 	/* implementation dependent freeing */
 	map->ops->map_free(map);
 }
@@ -291,17 +297,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+			      loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_READ.
+	 */
+	return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+			       size_t siz, loff_t *ppos)
+{
+	/* We need this handler such that alloc_file() enables
+	 * f_mode with FMODE_CAN_WRITE.
+	 */
+	return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_map_show_fdinfo,
 #endif
 	.release	= bpf_map_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
+	int ret;
+
+	ret = security_bpf_map(map, OPEN_FMODE(flags));
+	if (ret < 0)
+		return ret;
+
 	return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
-				O_RDWR | O_CLOEXEC);
+				flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+	if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+		return -EINVAL;
+	if (flags & BPF_F_RDONLY)
+		return O_RDONLY;
+	if (flags & BPF_F_WRONLY)
+		return O_WRONLY;
+	return O_RDWR;
 }
 
 /* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -312,18 +355,46 @@ int bpf_map_new_fd(struct bpf_map *map)
 		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
 		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
 
-#define BPF_MAP_CREATE_LAST_FIELD numa_node
+/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes.
+ * Return 0 on success and < 0 on error.
+ */
+static int bpf_obj_name_cpy(char *dst, const char *src)
+{
+	const char *end = src + BPF_OBJ_NAME_LEN;
+
+	memset(dst, 0, BPF_OBJ_NAME_LEN);
+
+	/* Copy all isalnum() and '_' char */
+	while (src < end && *src) {
+		if (!isalnum(*src) && *src != '_')
+			return -EINVAL;
+		*dst++ = *src++;
+	}
+
+	/* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */
+	if (src == end)
+		return -EINVAL;
+
+	return 0;
+}
+
+#define BPF_MAP_CREATE_LAST_FIELD map_name
 /* called via syscall */
 static int map_create(union bpf_attr *attr)
 {
 	int numa_node = bpf_map_attr_numa_node(attr);
 	struct bpf_map *map;
+	int f_flags;
 	int err;
 
 	err = CHECK_ATTR(BPF_MAP_CREATE);
 	if (err)
 		return -EINVAL;
 
+	f_flags = bpf_get_file_flag(attr->map_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	if (numa_node != NUMA_NO_NODE &&
 	    ((unsigned int)numa_node >= nr_node_ids ||
 	     !node_online(numa_node)))
@@ -334,18 +405,26 @@ static int map_create(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	err = bpf_obj_name_cpy(map->name, attr->map_name);
+	if (err)
+		goto free_map_nouncharge;
+
 	atomic_set(&map->refcnt, 1);
 	atomic_set(&map->usercnt, 1);
 
-	err = bpf_map_charge_memlock(map);
+	err = security_bpf_map_alloc(map);
 	if (err)
 		goto free_map_nouncharge;
 
+	err = bpf_map_charge_memlock(map);
+	if (err)
+		goto free_map_sec;
+
 	err = bpf_map_alloc_id(map);
 	if (err)
 		goto free_map;
 
-	err = bpf_map_new_fd(map);
+	err = bpf_map_new_fd(map, f_flags);
 	if (err < 0) {
 		/* failed to allocate fd.
 		 * bpf_map_put() is needed because the above
@@ -362,6 +441,8 @@ static int map_create(union bpf_attr *attr)
 
 free_map:
 	bpf_map_uncharge_memlock(map);
+free_map_sec:
+	security_bpf_map_free(map);
 free_map_nouncharge:
 	map->ops->map_free(map);
 	return err;
@@ -460,6 +541,11 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -540,6 +626,11 @@ static int map_update_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -562,6 +653,12 @@ static int map_update_elem(union bpf_attr *attr)
 	if (copy_from_user(value, uvalue, value_size) != 0)
 		goto free_value;
 
+	/* Need to create a kthread, thus must support schedule */
+	if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
+		err = map->ops->map_update_elem(map, key, value, attr->flags);
+		goto out;
+	}
+
 	/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
 	 * inside bpf map update or delete otherwise deadlocks are possible
 	 */
@@ -592,7 +689,7 @@ static int map_update_elem(union bpf_attr *attr)
 	}
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
-
+out:
 	if (!err)
 		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
@@ -623,6 +720,11 @@ static int map_delete_elem(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	key = memdup_user(ukey, map->key_size);
 	if (IS_ERR(key)) {
 		err = PTR_ERR(key);
@@ -666,6 +768,11 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
+	if (!(f.file->f_mode & FMODE_CAN_READ)) {
+		err = -EPERM;
+		goto err_put;
+	}
+
 	if (ukey) {
 		key = memdup_user(ukey, map->key_size);
 		if (IS_ERR(key)) {
@@ -703,9 +810,9 @@ err_put:
 	return err;
 }
 
-static const struct bpf_verifier_ops * const bpf_prog_types[] = {
-#define BPF_PROG_TYPE(_id, _ops) \
-	[_id] = &_ops,
+static const struct bpf_prog_ops * const bpf_prog_types[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _prog_ops,
 #define BPF_MAP_TYPE(_id, _ops)
 #include <linux/bpf_types.h>
 #undef BPF_PROG_TYPE
@@ -717,7 +824,10 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
 	if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type])
 		return -EINVAL;
 
-	prog->aux->ops = bpf_prog_types[type];
+	if (!bpf_prog_is_dev_bound(prog->aux))
+		prog->aux->ops = bpf_prog_types[type];
+	else
+		prog->aux->ops = &bpf_offload_prog_ops;
 	prog->type = type;
 	return 0;
 }
@@ -820,6 +930,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 	free_used_maps(aux);
 	bpf_prog_uncharge_memlock(aux->prog);
+	security_bpf_prog_free(aux);
 	bpf_prog_free(aux->prog);
 }
 
@@ -867,15 +978,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
 #ifdef CONFIG_PROC_FS
 	.show_fdinfo	= bpf_prog_show_fdinfo,
 #endif
 	.release	= bpf_prog_release,
+	.read		= bpf_dummy_read,
+	.write		= bpf_dummy_write,
 };
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
 {
+	int ret;
+
+	ret = security_bpf_prog(prog);
+	if (ret < 0)
+		return ret;
+
 	return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
 				O_RDWR | O_CLOEXEC);
 }
@@ -938,7 +1057,23 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
 }
 EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero);
 
-static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
+static bool bpf_prog_get_ok(struct bpf_prog *prog,
+			    enum bpf_prog_type *attach_type, bool attach_drv)
+{
+	/* not an attachment, just a refcount inc, always allow */
+	if (!attach_type)
+		return true;
+
+	if (prog->type != *attach_type)
+		return false;
+	if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv)
+		return false;
+
+	return true;
+}
+
+static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type,
+				       bool attach_drv)
 {
 	struct fd f = fdget(ufd);
 	struct bpf_prog *prog;
@@ -946,7 +1081,7 @@ static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
 	prog = ____bpf_prog_get(f);
 	if (IS_ERR(prog))
 		return prog;
-	if (type && prog->type != *type) {
+	if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) {
 		prog = ERR_PTR(-EINVAL);
 		goto out;
 	}
@@ -959,21 +1094,22 @@ out:
 
 struct bpf_prog *bpf_prog_get(u32 ufd)
 {
-	return __bpf_prog_get(ufd, NULL);
+	return __bpf_prog_get(ufd, NULL, false);
 }
 
-struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
+struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type,
+				       bool attach_drv)
 {
-	struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type, attach_drv);
 
 	if (!IS_ERR(prog))
 		trace_bpf_prog_get_type(prog);
 	return prog;
 }
-EXPORT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev);
 
 /* last field in 'union bpf_attr' used by this command */
-#define	BPF_PROG_LOAD_LAST_FIELD prog_flags
+#define	BPF_PROG_LOAD_LAST_FIELD prog_ifindex
 
 static int bpf_prog_load(union bpf_attr *attr)
 {
@@ -1015,10 +1151,14 @@ static int bpf_prog_load(union bpf_attr *attr)
 	if (!prog)
 		return -ENOMEM;
 
-	err = bpf_prog_charge_memlock(prog);
+	err = security_bpf_prog_alloc(prog->aux);
 	if (err)
 		goto free_prog_nouncharge;
 
+	err = bpf_prog_charge_memlock(prog);
+	if (err)
+		goto free_prog_sec;
+
 	prog->len = attr->insn_cnt;
 
 	err = -EFAULT;
@@ -1032,11 +1172,22 @@ static int bpf_prog_load(union bpf_attr *attr)
 	atomic_set(&prog->aux->refcnt, 1);
 	prog->gpl_compatible = is_gpl ? 1 : 0;
 
+	if (attr->prog_ifindex) {
+		err = bpf_prog_offload_init(prog, attr);
+		if (err)
+			goto free_prog;
+	}
+
 	/* find program type: socket_filter vs tracing_filter */
 	err = find_prog_type(type, prog);
 	if (err < 0)
 		goto free_prog;
 
+	prog->aux->load_time = ktime_get_boot_ns();
+	err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name);
+	if (err)
+		goto free_prog;
+
 	/* run eBPF verifier */
 	err = bpf_check(&prog, attr);
 	if (err < 0)
@@ -1071,16 +1222,18 @@ free_used_maps:
 	free_used_maps(prog->aux);
 free_prog:
 	bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+	security_bpf_prog_free(prog->aux);
 free_prog_nouncharge:
 	bpf_prog_free(prog);
 	return err;
 }
 
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
 
 static int bpf_obj_pin(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ))
+	if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
 		return -EINVAL;
 
 	return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1088,10 +1241,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
 
 static int bpf_obj_get(const union bpf_attr *attr)
 {
-	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+	if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+	    attr->file_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+	return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+				attr->file_flags);
 }
 
 #ifdef CONFIG_CGROUP_BPF
@@ -1132,6 +1287,9 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, bool attach)
 	return 0;
 }
 
+#define BPF_F_ATTACH_MASK \
+	(BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI)
+
 static int bpf_prog_attach(const union bpf_attr *attr)
 {
 	enum bpf_prog_type ptype;
@@ -1145,7 +1303,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_ATTACH))
 		return -EINVAL;
 
-	if (attr->attach_flags & ~BPF_F_ALLOW_OVERRIDE)
+	if (attr->attach_flags & ~BPF_F_ATTACH_MASK)
 		return -EINVAL;
 
 	switch (attr->attach_type) {
@@ -1159,6 +1317,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 	case BPF_CGROUP_SOCK_OPS:
 		ptype = BPF_PROG_TYPE_SOCK_OPS;
 		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
+		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
 		return sockmap_get_from_fd(attr, true);
@@ -1176,8 +1337,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 		return PTR_ERR(cgrp);
 	}
 
-	ret = cgroup_bpf_update(cgrp, prog, attr->attach_type,
-				attr->attach_flags & BPF_F_ALLOW_OVERRIDE);
+	ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type,
+				attr->attach_flags);
 	if (ret)
 		bpf_prog_put(prog);
 	cgroup_put(cgrp);
@@ -1189,6 +1350,8 @@ static int bpf_prog_attach(const union bpf_attr *attr)
 
 static int bpf_prog_detach(const union bpf_attr *attr)
 {
+	enum bpf_prog_type ptype;
+	struct bpf_prog *prog;
 	struct cgroup *cgrp;
 	int ret;
 
@@ -1201,26 +1364,71 @@ static int bpf_prog_detach(const union bpf_attr *attr)
 	switch (attr->attach_type) {
 	case BPF_CGROUP_INET_INGRESS:
 	case BPF_CGROUP_INET_EGRESS:
+		ptype = BPF_PROG_TYPE_CGROUP_SKB;
+		break;
 	case BPF_CGROUP_INET_SOCK_CREATE:
+		ptype = BPF_PROG_TYPE_CGROUP_SOCK;
+		break;
 	case BPF_CGROUP_SOCK_OPS:
-		cgrp = cgroup_get_from_fd(attr->target_fd);
-		if (IS_ERR(cgrp))
-			return PTR_ERR(cgrp);
-
-		ret = cgroup_bpf_update(cgrp, NULL, attr->attach_type, false);
-		cgroup_put(cgrp);
+		ptype = BPF_PROG_TYPE_SOCK_OPS;
+		break;
+	case BPF_CGROUP_DEVICE:
+		ptype = BPF_PROG_TYPE_CGROUP_DEVICE;
 		break;
 	case BPF_SK_SKB_STREAM_PARSER:
 	case BPF_SK_SKB_STREAM_VERDICT:
-		ret = sockmap_get_from_fd(attr, false);
-		break;
+		return sockmap_get_from_fd(attr, false);
 	default:
 		return -EINVAL;
 	}
 
+	cgrp = cgroup_get_from_fd(attr->target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+
+	prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
+	if (IS_ERR(prog))
+		prog = NULL;
+
+	ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0);
+	if (prog)
+		bpf_prog_put(prog);
+	cgroup_put(cgrp);
 	return ret;
 }
 
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+
+static int bpf_prog_query(const union bpf_attr *attr,
+			  union bpf_attr __user *uattr)
+{
+	struct cgroup *cgrp;
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+	if (CHECK_ATTR(BPF_PROG_QUERY))
+		return -EINVAL;
+	if (attr->query.query_flags & ~BPF_F_QUERY_EFFECTIVE)
+		return -EINVAL;
+
+	switch (attr->query.attach_type) {
+	case BPF_CGROUP_INET_INGRESS:
+	case BPF_CGROUP_INET_EGRESS:
+	case BPF_CGROUP_INET_SOCK_CREATE:
+	case BPF_CGROUP_SOCK_OPS:
+	case BPF_CGROUP_DEVICE:
+		break;
+	default:
+		return -EINVAL;
+	}
+	cgrp = cgroup_get_from_fd(attr->query.target_fd);
+	if (IS_ERR(cgrp))
+		return PTR_ERR(cgrp);
+	ret = cgroup_bpf_query(cgrp, attr, uattr);
+	cgroup_put(cgrp);
+	return ret;
+}
 #endif /* CONFIG_CGROUP_BPF */
 
 #define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -1305,20 +1513,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
 
 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 {
 	struct bpf_map *map;
 	u32 id = attr->map_id;
+	int f_flags;
 	int fd;
 
-	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+	if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	f_flags = bpf_get_file_flag(attr->open_flags);
+	if (f_flags < 0)
+		return f_flags;
+
 	spin_lock_bh(&map_idr_lock);
 	map = idr_find(&map_idr, id);
 	if (map)
@@ -1330,7 +1544,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	if (IS_ERR(map))
 		return PTR_ERR(map);
 
-	fd = bpf_map_new_fd(map);
+	fd = bpf_map_new_fd(map, f_flags);
 	if (fd < 0)
 		bpf_map_put(map);
 
@@ -1358,8 +1572,25 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 
 	info.type = prog->type;
 	info.id = prog->aux->id;
+	info.load_time = prog->aux->load_time;
+	info.created_by_uid = from_kuid_munged(current_user_ns(),
+					       prog->aux->user->uid);
 
 	memcpy(info.tag, prog->tag, sizeof(prog->tag));
+	memcpy(info.name, prog->aux->name, sizeof(prog->aux->name));
+
+	ulen = info.nr_map_ids;
+	info.nr_map_ids = prog->aux->used_map_cnt;
+	ulen = min_t(u32, info.nr_map_ids, ulen);
+	if (ulen) {
+		u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids);
+		u32 i;
+
+		for (i = 0; i < ulen; i++)
+			if (put_user(prog->aux->used_maps[i]->id,
+				     &user_map_ids[i]))
+				return -EFAULT;
+	}
 
 	if (!capable(CAP_SYS_ADMIN)) {
 		info.jited_prog_len = 0;
@@ -1413,6 +1644,7 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map,
 	info.value_size = map->value_size;
 	info.max_entries = map->max_entries;
 	info.map_flags = map->map_flags;
+	memcpy(info.name, map->name, sizeof(map->name));
 
 	if (copy_to_user(uinfo, &info, info_len) ||
 	    put_user(info_len, &uattr->info.info_len))
@@ -1467,6 +1699,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	if (copy_from_user(&attr, uattr, size) != 0)
 		return -EFAULT;
 
+	err = security_bpf(cmd, &attr, size);
+	if (err < 0)
+		return err;
+
 	switch (cmd) {
 	case BPF_MAP_CREATE:
 		err = map_create(&attr);
@@ -1499,6 +1735,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	case BPF_PROG_DETACH:
 		err = bpf_prog_detach(&attr);
 		break;
+	case BPF_PROG_QUERY:
+		err = bpf_prog_query(&attr, uattr);
+		break;
 #endif
 	case BPF_PROG_TEST_RUN:
 		err = bpf_prog_test_run(&attr, uattr);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index c48ca2a34b5e..d4593571c404 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -21,6 +21,17 @@
 #include <linux/vmalloc.h>
 #include <linux/stringify.h>
 
+#include "disasm.h"
+
+static const struct bpf_verifier_ops * const bpf_verifier_ops[] = {
+#define BPF_PROG_TYPE(_id, _name) \
+	[_id] = & _name ## _verifier_ops,
+#define BPF_MAP_TYPE(_id, _ops)
+#include <linux/bpf_types.h>
+#undef BPF_PROG_TYPE
+#undef BPF_MAP_TYPE
+};
+
 /* bpf_check() is a static code analyzer that walks eBPF program
  * instruction by instruction and updates register/stack state.
  * All paths of conditional branches are analyzed until 'bpf_exit' insn.
@@ -153,28 +164,42 @@ struct bpf_call_arg_meta {
 	int access_size;
 };
 
-/* verbose verifier prints what it's seeing
- * bpf_check() is called under lock, so no race to access these global vars
- */
-static u32 log_level, log_size, log_len;
-static char *log_buf;
-
 static DEFINE_MUTEX(bpf_verifier_lock);
 
 /* log_level controls verbosity level of eBPF verifier.
  * verbose() is used to dump the verification trace to the log, so the user
  * can figure out what's wrong with the program
  */
-static __printf(1, 2) void verbose(const char *fmt, ...)
+static __printf(2, 3) void verbose(struct bpf_verifier_env *env,
+				   const char *fmt, ...)
 {
+	struct bpf_verifer_log *log = &env->log;
+	unsigned int n;
 	va_list args;
 
-	if (log_level == 0 || log_len >= log_size - 1)
+	if (!log->level || !log->ubuf || bpf_verifier_log_full(log))
 		return;
 
 	va_start(args, fmt);
-	log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+	n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args);
 	va_end(args);
+
+	WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1,
+		  "verifier log line truncated - local buffer too short\n");
+
+	n = min(log->len_total - log->len_used - 1, n);
+	log->kbuf[n] = '\0';
+
+	if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1))
+		log->len_used += n;
+	else
+		log->ubuf = NULL;
+}
+
+static bool type_is_pkt_pointer(enum bpf_reg_type type)
+{
+	return type == PTR_TO_PACKET ||
+	       type == PTR_TO_PACKET_META;
 }
 
 /* string representation of 'enum bpf_reg_type' */
@@ -187,26 +212,12 @@ static const char * const reg_type_str[] = {
 	[PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
 	[PTR_TO_STACK]		= "fp",
 	[PTR_TO_PACKET]		= "pkt",
+	[PTR_TO_PACKET_META]	= "pkt_meta",
 	[PTR_TO_PACKET_END]	= "pkt_end",
 };
 
-#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x)
-static const char * const func_id_str[] = {
-	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
-};
-#undef __BPF_FUNC_STR_FN
-
-static const char *func_id_name(int id)
-{
-	BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID);
-
-	if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id])
-		return func_id_str[id];
-	else
-		return "unknown";
-}
-
-static void print_verifier_state(struct bpf_verifier_state *state)
+static void print_verifier_state(struct bpf_verifier_env *env,
+				 struct bpf_verifier_state *state)
 {
 	struct bpf_reg_state *reg;
 	enum bpf_reg_type t;
@@ -217,21 +228,21 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 		t = reg->type;
 		if (t == NOT_INIT)
 			continue;
-		verbose(" R%d=%s", i, reg_type_str[t]);
+		verbose(env, " R%d=%s", i, reg_type_str[t]);
 		if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
 		    tnum_is_const(reg->var_off)) {
 			/* reg->off should be 0 for SCALAR_VALUE */
-			verbose("%lld", reg->var_off.value + reg->off);
+			verbose(env, "%lld", reg->var_off.value + reg->off);
 		} else {
-			verbose("(id=%d", reg->id);
+			verbose(env, "(id=%d", reg->id);
 			if (t != SCALAR_VALUE)
-				verbose(",off=%d", reg->off);
-			if (t == PTR_TO_PACKET)
-				verbose(",r=%d", reg->range);
+				verbose(env, ",off=%d", reg->off);
+			if (type_is_pkt_pointer(t))
+				verbose(env, ",r=%d", reg->range);
 			else if (t == CONST_PTR_TO_MAP ||
 				 t == PTR_TO_MAP_VALUE ||
 				 t == PTR_TO_MAP_VALUE_OR_NULL)
-				verbose(",ks=%d,vs=%d",
+				verbose(env, ",ks=%d,vs=%d",
 					reg->map_ptr->key_size,
 					reg->map_ptr->value_size);
 			if (tnum_is_const(reg->var_off)) {
@@ -239,243 +250,174 @@ static void print_verifier_state(struct bpf_verifier_state *state)
 				 * could be a pointer whose offset is too big
 				 * for reg->off
 				 */
-				verbose(",imm=%llx", reg->var_off.value);
+				verbose(env, ",imm=%llx", reg->var_off.value);
 			} else {
 				if (reg->smin_value != reg->umin_value &&
 				    reg->smin_value != S64_MIN)
-					verbose(",smin_value=%lld",
+					verbose(env, ",smin_value=%lld",
 						(long long)reg->smin_value);
 				if (reg->smax_value != reg->umax_value &&
 				    reg->smax_value != S64_MAX)
-					verbose(",smax_value=%lld",
+					verbose(env, ",smax_value=%lld",
 						(long long)reg->smax_value);
 				if (reg->umin_value != 0)
-					verbose(",umin_value=%llu",
+					verbose(env, ",umin_value=%llu",
 						(unsigned long long)reg->umin_value);
 				if (reg->umax_value != U64_MAX)
-					verbose(",umax_value=%llu",
+					verbose(env, ",umax_value=%llu",
 						(unsigned long long)reg->umax_value);
 				if (!tnum_is_unknown(reg->var_off)) {
 					char tn_buf[48];
 
 					tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-					verbose(",var_off=%s", tn_buf);
+					verbose(env, ",var_off=%s", tn_buf);
 				}
 			}
-			verbose(")");
+			verbose(env, ")");
 		}
 	}
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] == STACK_SPILL)
-			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
-				reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] == STACK_SPILL)
+			verbose(env, " fp%d=%s",
+				-MAX_BPF_STACK + i * BPF_REG_SIZE,
+				reg_type_str[state->stack[i].spilled_ptr.type]);
 	}
-	verbose("\n");
+	verbose(env, "\n");
 }
 
-static const char *const bpf_class_string[] = {
-	[BPF_LD]    = "ld",
-	[BPF_LDX]   = "ldx",
-	[BPF_ST]    = "st",
-	[BPF_STX]   = "stx",
-	[BPF_ALU]   = "alu",
-	[BPF_JMP]   = "jmp",
-	[BPF_RET]   = "BUG",
-	[BPF_ALU64] = "alu64",
-};
-
-static const char *const bpf_alu_string[16] = {
-	[BPF_ADD >> 4]  = "+=",
-	[BPF_SUB >> 4]  = "-=",
-	[BPF_MUL >> 4]  = "*=",
-	[BPF_DIV >> 4]  = "/=",
-	[BPF_OR  >> 4]  = "|=",
-	[BPF_AND >> 4]  = "&=",
-	[BPF_LSH >> 4]  = "<<=",
-	[BPF_RSH >> 4]  = ">>=",
-	[BPF_NEG >> 4]  = "neg",
-	[BPF_MOD >> 4]  = "%=",
-	[BPF_XOR >> 4]  = "^=",
-	[BPF_MOV >> 4]  = "=",
-	[BPF_ARSH >> 4] = "s>>=",
-	[BPF_END >> 4]  = "endian",
-};
-
-static const char *const bpf_ldst_string[] = {
-	[BPF_W >> 3]  = "u32",
-	[BPF_H >> 3]  = "u16",
-	[BPF_B >> 3]  = "u8",
-	[BPF_DW >> 3] = "u64",
-};
-
-static const char *const bpf_jmp_string[16] = {
-	[BPF_JA >> 4]   = "jmp",
-	[BPF_JEQ >> 4]  = "==",
-	[BPF_JGT >> 4]  = ">",
-	[BPF_JLT >> 4]  = "<",
-	[BPF_JGE >> 4]  = ">=",
-	[BPF_JLE >> 4]  = "<=",
-	[BPF_JSET >> 4] = "&",
-	[BPF_JNE >> 4]  = "!=",
-	[BPF_JSGT >> 4] = "s>",
-	[BPF_JSLT >> 4] = "s<",
-	[BPF_JSGE >> 4] = "s>=",
-	[BPF_JSLE >> 4] = "s<=",
-	[BPF_CALL >> 4] = "call",
-	[BPF_EXIT >> 4] = "exit",
-};
+static int copy_stack_state(struct bpf_verifier_state *dst,
+			    const struct bpf_verifier_state *src)
+{
+	if (!src->stack)
+		return 0;
+	if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+		/* internal bug, make state invalid to reject the program */
+		memset(dst, 0, sizeof(*dst));
+		return -EFAULT;
+	}
+	memcpy(dst->stack, src->stack,
+	       sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+	return 0;
+}
 
-static void print_bpf_insn(const struct bpf_verifier_env *env,
-			   const struct bpf_insn *insn)
+/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+ * make it consume minimal amount of memory. check_stack_write() access from
+ * the program calls into realloc_verifier_state() to grow the stack size.
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which this function copies over. It points to previous bpf_verifier_state
+ * which is never reallocated
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+				  bool copy_old)
 {
-	u8 class = BPF_CLASS(insn->code);
-
-	if (class == BPF_ALU || class == BPF_ALU64) {
-		if (BPF_SRC(insn->code) == BPF_X)
-			verbose("(%02x) %sr%d %s %sr%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->src_reg);
-		else
-			verbose("(%02x) %sr%d %s %s%d\n",
-				insn->code, class == BPF_ALU ? "(u32) " : "",
-				insn->dst_reg,
-				bpf_alu_string[BPF_OP(insn->code) >> 4],
-				class == BPF_ALU ? "(u32) " : "",
-				insn->imm);
-	} else if (class == BPF_STX) {
-		if (BPF_MODE(insn->code) == BPF_MEM)
-			verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg,
-				insn->off, insn->src_reg);
-		else if (BPF_MODE(insn->code) == BPF_XADD)
-			verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->dst_reg, insn->off,
-				insn->src_reg);
-		else
-			verbose("BUG_%02x\n", insn->code);
-	} else if (class == BPF_ST) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_st_%02x\n", insn->code);
-			return;
-		}
-		verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
-			insn->code,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->dst_reg,
-			insn->off, insn->imm);
-	} else if (class == BPF_LDX) {
-		if (BPF_MODE(insn->code) != BPF_MEM) {
-			verbose("BUG_ldx_%02x\n", insn->code);
-			return;
+	u32 old_size = state->allocated_stack;
+	struct bpf_stack_state *new_stack;
+	int slot = size / BPF_REG_SIZE;
+
+	if (size <= old_size || !size) {
+		if (copy_old)
+			return 0;
+		state->allocated_stack = slot * BPF_REG_SIZE;
+		if (!size && old_size) {
+			kfree(state->stack);
+			state->stack = NULL;
 		}
-		verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
-			insn->code, insn->dst_reg,
-			bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-			insn->src_reg, insn->off);
-	} else if (class == BPF_LD) {
-		if (BPF_MODE(insn->code) == BPF_ABS) {
-			verbose("(%02x) r0 = *(%s *)skb[%d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IND) {
-			verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
-				insn->code,
-				bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
-				insn->src_reg, insn->imm);
-		} else if (BPF_MODE(insn->code) == BPF_IMM &&
-			   BPF_SIZE(insn->code) == BPF_DW) {
-			/* At this point, we already made sure that the second
-			 * part of the ldimm64 insn is accessible.
-			 */
-			u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
-			bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+		return 0;
+	}
+	new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+				  GFP_KERNEL);
+	if (!new_stack)
+		return -ENOMEM;
+	if (copy_old) {
+		if (state->stack)
+			memcpy(new_stack, state->stack,
+			       sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+		memset(new_stack + old_size / BPF_REG_SIZE, 0,
+		       sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+	}
+	state->allocated_stack = slot * BPF_REG_SIZE;
+	kfree(state->stack);
+	state->stack = new_stack;
+	return 0;
+}
 
-			if (map_ptr && !env->allow_ptr_leaks)
-				imm = 0;
+static void free_verifier_state(struct bpf_verifier_state *state,
+				bool free_self)
+{
+	kfree(state->stack);
+	if (free_self)
+		kfree(state);
+}
 
-			verbose("(%02x) r%d = 0x%llx\n", insn->code,
-				insn->dst_reg, (unsigned long long)imm);
-		} else {
-			verbose("BUG_ld_%02x\n", insn->code);
-			return;
-		}
-	} else if (class == BPF_JMP) {
-		u8 opcode = BPF_OP(insn->code);
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+			       const struct bpf_verifier_state *src)
+{
+	int err;
 
-		if (opcode == BPF_CALL) {
-			verbose("(%02x) call %s#%d\n", insn->code,
-				func_id_name(insn->imm), insn->imm);
-		} else if (insn->code == (BPF_JMP | BPF_JA)) {
-			verbose("(%02x) goto pc%+d\n",
-				insn->code, insn->off);
-		} else if (insn->code == (BPF_JMP | BPF_EXIT)) {
-			verbose("(%02x) exit\n", insn->code);
-		} else if (BPF_SRC(insn->code) == BPF_X) {
-			verbose("(%02x) if r%d %s r%d goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->src_reg, insn->off);
-		} else {
-			verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
-				insn->code, insn->dst_reg,
-				bpf_jmp_string[BPF_OP(insn->code) >> 4],
-				insn->imm, insn->off);
-		}
-	} else {
-		verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
-	}
+	err = realloc_verifier_state(dst, src->allocated_stack, false);
+	if (err)
+		return err;
+	memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+	return copy_stack_state(dst, src);
 }
 
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+		     int *insn_idx)
 {
-	struct bpf_verifier_stack_elem *elem;
-	int insn_idx;
+	struct bpf_verifier_state *cur = env->cur_state;
+	struct bpf_verifier_stack_elem *elem, *head = env->head;
+	int err;
 
 	if (env->head == NULL)
-		return -1;
+		return -ENOENT;
 
-	memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
-	insn_idx = env->head->insn_idx;
+	if (cur) {
+		err = copy_verifier_state(cur, &head->st);
+		if (err)
+			return err;
+	}
+	if (insn_idx)
+		*insn_idx = head->insn_idx;
 	if (prev_insn_idx)
-		*prev_insn_idx = env->head->prev_insn_idx;
-	elem = env->head->next;
-	kfree(env->head);
+		*prev_insn_idx = head->prev_insn_idx;
+	elem = head->next;
+	free_verifier_state(&head->st, false);
+	kfree(head);
 	env->head = elem;
 	env->stack_size--;
-	return insn_idx;
+	return 0;
 }
 
 static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
 					     int insn_idx, int prev_insn_idx)
 {
+	struct bpf_verifier_state *cur = env->cur_state;
 	struct bpf_verifier_stack_elem *elem;
+	int err;
 
-	elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+	elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
 	if (!elem)
 		goto err;
 
-	memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
 	elem->insn_idx = insn_idx;
 	elem->prev_insn_idx = prev_insn_idx;
 	elem->next = env->head;
 	env->head = elem;
 	env->stack_size++;
+	err = copy_verifier_state(&elem->st, cur);
+	if (err)
+		goto err;
 	if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
-		verbose("BPF program is too complex\n");
+		verbose(env, "BPF program is too complex\n");
 		goto err;
 	}
 	return &elem->st;
 err:
 	/* pop all elements and return */
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	return NULL;
 }
 
@@ -507,10 +449,11 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg)
 	__mark_reg_known(reg, 0);
 }
 
-static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_known_zero(struct bpf_verifier_env *env,
+				struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_known_zero(regs, %u)\n", regno);
+		verbose(env, "mark_reg_known_zero(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -519,6 +462,31 @@ static void mark_reg_known_zero(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_known_zero(regs + regno);
 }
 
+static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+{
+	return type_is_pkt_pointer(reg->type);
+}
+
+static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg)
+{
+	return reg_is_pkt_pointer(reg) ||
+	       reg->type == PTR_TO_PACKET_END;
+}
+
+/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */
+static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg,
+				    enum bpf_reg_type which)
+{
+	/* The register can already have a range from prior markings.
+	 * This is fine as long as it hasn't been advanced from its
+	 * origin.
+	 */
+	return reg->type == which &&
+	       reg->id == 0 &&
+	       reg->off == 0 &&
+	       tnum_equals_const(reg->var_off, 0);
+}
+
 /* Attempts to improve min/max values based on var_off information */
 static void __update_reg_bounds(struct bpf_reg_state *reg)
 {
@@ -595,10 +563,11 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg)
 	__mark_reg_unbounded(reg);
 }
 
-static void mark_reg_unknown(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_unknown(struct bpf_verifier_env *env,
+			     struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_unknown(regs, %u)\n", regno);
+		verbose(env, "mark_reg_unknown(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -613,10 +582,11 @@ static void __mark_reg_not_init(struct bpf_reg_state *reg)
 	reg->type = NOT_INIT;
 }
 
-static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
+static void mark_reg_not_init(struct bpf_verifier_env *env,
+			      struct bpf_reg_state *regs, u32 regno)
 {
 	if (WARN_ON(regno >= MAX_BPF_REG)) {
-		verbose("mark_reg_not_init(regs, %u)\n", regno);
+		verbose(env, "mark_reg_not_init(regs, %u)\n", regno);
 		/* Something bad happened, let's kill all regs */
 		for (regno = 0; regno < MAX_BPF_REG; regno++)
 			__mark_reg_not_init(regs + regno);
@@ -625,22 +595,23 @@ static void mark_reg_not_init(struct bpf_reg_state *regs, u32 regno)
 	__mark_reg_not_init(regs + regno);
 }
 
-static void init_reg_state(struct bpf_reg_state *regs)
+static void init_reg_state(struct bpf_verifier_env *env,
+			   struct bpf_reg_state *regs)
 {
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++) {
-		mark_reg_not_init(regs, i);
+		mark_reg_not_init(env, regs, i);
 		regs[i].live = REG_LIVE_NONE;
 	}
 
 	/* frame pointer */
 	regs[BPF_REG_FP].type = PTR_TO_STACK;
-	mark_reg_known_zero(regs, BPF_REG_FP);
+	mark_reg_known_zero(env, regs, BPF_REG_FP);
 
 	/* 1st arg to a function */
 	regs[BPF_REG_1].type = PTR_TO_CTX;
-	mark_reg_known_zero(regs, BPF_REG_1);
+	mark_reg_known_zero(env, regs, BPF_REG_1);
 }
 
 enum reg_arg_type {
@@ -671,29 +642,29 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
 static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 			 enum reg_arg_type t)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = env->cur_state->regs;
 
 	if (regno >= MAX_BPF_REG) {
-		verbose("R%d is invalid\n", regno);
+		verbose(env, "R%d is invalid\n", regno);
 		return -EINVAL;
 	}
 
 	if (t == SRC_OP) {
 		/* check whether register used as source operand can be read */
 		if (regs[regno].type == NOT_INIT) {
-			verbose("R%d !read_ok\n", regno);
+			verbose(env, "R%d !read_ok\n", regno);
 			return -EACCES;
 		}
-		mark_reg_read(&env->cur_state, regno);
+		mark_reg_read(env->cur_state, regno);
 	} else {
 		/* check whether register used as dest operand can be written to */
 		if (regno == BPF_REG_FP) {
-			verbose("frame pointer is read only\n");
+			verbose(env, "frame pointer is read only\n");
 			return -EACCES;
 		}
 		regs[regno].live |= REG_LIVE_WRITTEN;
 		if (t == DST_OP)
-			mark_reg_unknown(regs, regno);
+			mark_reg_unknown(env, regs, regno);
 	}
 	return 0;
 }
@@ -706,6 +677,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET_END:
 	case CONST_PTR_TO_MAP:
 		return true;
@@ -717,35 +689,48 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 /* check_stack_read/write functions track spill/fill of registers,
  * stack boundary and alignment are checked in check_mem_access()
  */
-static int check_stack_write(struct bpf_verifier_state *state, int off,
+static int check_stack_write(struct bpf_verifier_env *env,
+			     struct bpf_verifier_state *state, int off,
 			     int size, int value_regno)
 {
-	int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+	err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+				     true);
+	if (err)
+		return err;
 	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
 	 * so it's aligned access and [off, off + size) are within stack limits
 	 */
+	if (!env->allow_ptr_leaks &&
+	    state->stack[spi].slot_type[0] == STACK_SPILL &&
+	    size != BPF_REG_SIZE) {
+		verbose(env, "attempt to corrupt spilled pointer on stack\n");
+		return -EACCES;
+	}
 
 	if (value_regno >= 0 &&
 	    is_spillable_regtype(state->regs[value_regno].type)) {
 
 		/* register containing pointer is being spilled into stack */
 		if (size != BPF_REG_SIZE) {
-			verbose("invalid size of register spill\n");
+			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 
 		/* save register state */
-		state->spilled_regs[spi] = state->regs[value_regno];
-		state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
+		state->stack[spi].spilled_ptr = state->regs[value_regno];
+		state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
 
 		for (i = 0; i < BPF_REG_SIZE; i++)
-			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+			state->stack[spi].slot_type[i] = STACK_SPILL;
 	} else {
 		/* regular write of data into stack */
-		state->spilled_regs[spi] = (struct bpf_reg_state) {};
+		state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
 
 		for (i = 0; i < size; i++)
-			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+			state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+				STACK_MISC;
 	}
 	return 0;
 }
@@ -756,66 +741,72 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
 
 	while (parent) {
 		/* if read wasn't screened by an earlier write ... */
-		if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+		if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
 			break;
 		/* ... then we depend on parent's value */
-		parent->spilled_regs[slot].live |= REG_LIVE_READ;
+		parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
 		state = parent;
 		parent = state->parent;
 	}
 }
 
-static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
+static int check_stack_read(struct bpf_verifier_env *env,
+			    struct bpf_verifier_state *state, int off, int size,
 			    int value_regno)
 {
-	u8 *slot_type;
-	int i, spi;
+	int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+	u8 *stype;
 
-	slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+	if (state->allocated_stack <= slot) {
+		verbose(env, "invalid read from stack off %d+0 size %d\n",
+			off, size);
+		return -EACCES;
+	}
+	stype = state->stack[spi].slot_type;
 
-	if (slot_type[0] == STACK_SPILL) {
+	if (stype[0] == STACK_SPILL) {
 		if (size != BPF_REG_SIZE) {
-			verbose("invalid size of register spill\n");
+			verbose(env, "invalid size of register spill\n");
 			return -EACCES;
 		}
 		for (i = 1; i < BPF_REG_SIZE; i++) {
-			if (slot_type[i] != STACK_SPILL) {
-				verbose("corrupted spill memory\n");
+			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
+				verbose(env, "corrupted spill memory\n");
 				return -EACCES;
 			}
 		}
 
-		spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
-
 		if (value_regno >= 0) {
 			/* restore register state from stack */
-			state->regs[value_regno] = state->spilled_regs[spi];
+			state->regs[value_regno] = state->stack[spi].spilled_ptr;
 			mark_stack_slot_read(state, spi);
 		}
 		return 0;
 	} else {
 		for (i = 0; i < size; i++) {
-			if (slot_type[i] != STACK_MISC) {
-				verbose("invalid read from stack off %d+%d size %d\n",
+			if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
+				verbose(env, "invalid read from stack off %d+%d size %d\n",
 					off, i, size);
 				return -EACCES;
 			}
 		}
 		if (value_regno >= 0)
 			/* have read misc data from the stack */
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, state->regs, value_regno);
 		return 0;
 	}
 }
 
 /* check read/write into map element returned by bpf_map_lookup_elem() */
 static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
-			    int size)
+			      int size, bool zero_size_allowed)
 {
-	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_map *map = regs[regno].map_ptr;
 
-	if (off < 0 || size <= 0 || off + size > map->value_size) {
-		verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+	    off + size > map->value_size) {
+		verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
 			map->value_size, off, size);
 		return -EACCES;
 	}
@@ -824,9 +815,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 
 /* check read/write into a map element with possible variable offset */
 static int check_map_access(struct bpf_verifier_env *env, u32 regno,
-				int off, int size)
+			    int off, int size, bool zero_size_allowed)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *reg = &state->regs[regno];
 	int err;
 
@@ -834,8 +825,8 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * need to try adding each of min_value and max_value to off
 	 * to make sure our theoretical access will be safe.
 	 */
-	if (log_level)
-		print_verifier_state(state);
+	if (env->log.level)
+		print_verifier_state(env, state);
 	/* The minimum value is only important with signed
 	 * comparisons where we can't assume the floor of a
 	 * value is 0.  If we are using signed variables for our
@@ -843,13 +834,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * will have a set floor within our range.
 	 */
 	if (reg->smin_value < 0) {
-		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 			regno);
 		return -EACCES;
 	}
-	err = __check_map_access(env, regno, reg->smin_value + off, size);
+	err = __check_map_access(env, regno, reg->smin_value + off, size,
+				 zero_size_allowed);
 	if (err) {
-		verbose("R%d min value is outside of the array range\n", regno);
+		verbose(env, "R%d min value is outside of the array range\n",
+			regno);
 		return err;
 	}
 
@@ -858,13 +851,15 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	 * If reg->umax_value + off could overflow, treat that as unbounded too.
 	 */
 	if (reg->umax_value >= BPF_MAX_VAR_OFF) {
-		verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+		verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n",
 			regno);
 		return -EACCES;
 	}
-	err = __check_map_access(env, regno, reg->umax_value + off, size);
+	err = __check_map_access(env, regno, reg->umax_value + off, size,
+				 zero_size_allowed);
 	if (err)
-		verbose("R%d max value is outside of the array range\n", regno);
+		verbose(env, "R%d max value is outside of the array range\n",
+			regno);
 	return err;
 }
 
@@ -897,13 +892,14 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 }
 
 static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
-				 int off, int size)
+				 int off, int size, bool zero_size_allowed)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 
-	if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
-		verbose("invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
+	if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) ||
+	    (u64)off + size > reg->range) {
+		verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n",
 			off, size, regno, reg->id, reg->off, reg->range);
 		return -EACCES;
 	}
@@ -911,9 +907,9 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
 }
 
 static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
-			       int size)
+			       int size, bool zero_size_allowed)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	struct bpf_reg_state *reg = &regs[regno];
 	int err;
 
@@ -926,13 +922,13 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
 	 * detail to prove they're safe.
 	 */
 	if (reg->smin_value < 0) {
-		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+		verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
 			regno);
 		return -EACCES;
 	}
-	err = __check_packet_access(env, regno, off, size);
+	err = __check_packet_access(env, regno, off, size, zero_size_allowed);
 	if (err) {
-		verbose("R%d offset is outside of the packet\n", regno);
+		verbose(env, "R%d offset is outside of the packet\n", regno);
 		return err;
 	}
 	return err;
@@ -946,12 +942,8 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		.reg_type = *reg_type,
 	};
 
-	/* for analyzer ctx accesses are already validated and converted */
-	if (env->analyzer_ops)
-		return 0;
-
-	if (env->prog->aux->ops->is_valid_access &&
-	    env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+	if (env->ops->is_valid_access &&
+	    env->ops->is_valid_access(off, size, t, &info)) {
 		/* A non zero info.ctx_field_size indicates that this field is a
 		 * candidate for later verifier transformation to load the whole
 		 * field and then apply a mask when accessed with a narrower
@@ -959,16 +951,16 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
 		 * will only allow for whole field access and rejects any other
 		 * type of narrower access.
 		 */
-		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		*reg_type = info.reg_type;
 
+		env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
 		/* remember the offset of last byte accessed in ctx */
 		if (env->prog->aux->max_ctx_offset < off + size)
 			env->prog->aux->max_ctx_offset = off + size;
 		return 0;
 	}
 
-	verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+	verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size);
 	return -EACCES;
 }
 
@@ -983,10 +975,11 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
 
 static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
 {
-	return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+	return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
 }
 
-static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_pkt_ptr_alignment(struct bpf_verifier_env *env,
+				   const struct bpf_reg_state *reg,
 				   int off, int size, bool strict)
 {
 	struct tnum reg_off;
@@ -1011,7 +1004,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose("misaligned packet access off %d+%s+%d+%d size %d\n",
+		verbose(env,
+			"misaligned packet access off %d+%s+%d+%d size %d\n",
 			ip_align, tn_buf, reg->off, off, size);
 		return -EACCES;
 	}
@@ -1019,7 +1013,8 @@ static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
 	return 0;
 }
 
-static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
+static int check_generic_ptr_alignment(struct bpf_verifier_env *env,
+				       const struct bpf_reg_state *reg,
 				       const char *pointer_desc,
 				       int off, int size, bool strict)
 {
@@ -1034,7 +1029,7 @@ static int check_generic_ptr_alignment(const struct bpf_reg_state *reg,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-		verbose("misaligned %saccess off %s+%d+%d size %d\n",
+		verbose(env, "misaligned %saccess off %s+%d+%d size %d\n",
 			pointer_desc, tn_buf, reg->off, off, size);
 		return -EACCES;
 	}
@@ -1051,8 +1046,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
-		/* special case, because of NET_IP_ALIGN */
-		return check_pkt_ptr_alignment(reg, off, size, strict);
+	case PTR_TO_PACKET_META:
+		/* Special case, because of NET_IP_ALIGN. Given metadata sits
+		 * right in front, treat it the very same way.
+		 */
+		return check_pkt_ptr_alignment(env, reg, off, size, strict);
 	case PTR_TO_MAP_VALUE:
 		pointer_desc = "value ";
 		break;
@@ -1065,7 +1063,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
 	default:
 		break;
 	}
-	return check_generic_ptr_alignment(reg, pointer_desc, off, size, strict);
+	return check_generic_ptr_alignment(env, reg, pointer_desc, off, size,
+					   strict);
 }
 
 /* check whether memory at (regno + off) is accessible for t = (read | write)
@@ -1078,8 +1077,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			    int bpf_size, enum bpf_access_type t,
 			    int value_regno)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
-	struct bpf_reg_state *reg = &state->regs[regno];
+	struct bpf_verifier_state *state = env->cur_state;
+	struct bpf_reg_state *regs = cur_regs(env);
+	struct bpf_reg_state *reg = regs + regno;
 	int size, err = 0;
 
 	size = bpf_size_to_bytes(bpf_size);
@@ -1097,27 +1097,28 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 	if (reg->type == PTR_TO_MAP_VALUE) {
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into map\n", value_regno);
+			verbose(env, "R%d leaks addr into map\n", value_regno);
 			return -EACCES;
 		}
 
-		err = check_map_access(env, regno, off, size);
+		err = check_map_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, regs, value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = SCALAR_VALUE;
 
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into ctx\n", value_regno);
+			verbose(env, "R%d leaks addr into ctx\n", value_regno);
 			return -EACCES;
 		}
 		/* ctx accesses must be at a fixed offset, so that we can
 		 * determine what type of data were returned.
 		 */
 		if (reg->off) {
-			verbose("dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
+			verbose(env,
+				"dereference of modified ctx ptr R%d off=%d+%d, ctx+const is allowed, ctx+const+const is not\n",
 				regno, reg->off, off - reg->off);
 			return -EACCES;
 		}
@@ -1125,24 +1126,26 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("variable ctx access var_off=%s off=%d size=%d",
+			verbose(env,
+				"variable ctx access var_off=%s off=%d size=%d",
 				tn_buf, off, size);
 			return -EACCES;
 		}
 		err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
 			/* ctx access returns either a scalar, or a
-			 * PTR_TO_PACKET[_END].  In the latter case, we know
-			 * the offset is zero.
+			 * PTR_TO_PACKET[_META,_END]. In the latter
+			 * case, we know the offset is zero.
 			 */
 			if (reg_type == SCALAR_VALUE)
-				mark_reg_unknown(state->regs, value_regno);
+				mark_reg_unknown(env, regs, value_regno);
 			else
-				mark_reg_known_zero(state->regs, value_regno);
-			state->regs[value_regno].id = 0;
-			state->regs[value_regno].off = 0;
-			state->regs[value_regno].range = 0;
-			state->regs[value_regno].type = reg_type;
+				mark_reg_known_zero(env, regs,
+						    value_regno);
+			regs[value_regno].id = 0;
+			regs[value_regno].off = 0;
+			regs[value_regno].range = 0;
+			regs[value_regno].type = reg_type;
 		}
 
 	} else if (reg->type == PTR_TO_STACK) {
@@ -1154,55 +1157,52 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
 			char tn_buf[48];
 
 			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
-			verbose("variable stack access var_off=%s off=%d size=%d",
+			verbose(env, "variable stack access var_off=%s off=%d size=%d",
 				tn_buf, off, size);
 			return -EACCES;
 		}
 		off += reg->var_off.value;
 		if (off >= 0 || off < -MAX_BPF_STACK) {
-			verbose("invalid stack off=%d size=%d\n", off, size);
+			verbose(env, "invalid stack off=%d size=%d\n", off,
+				size);
 			return -EACCES;
 		}
 
 		if (env->prog->aux->stack_depth < -off)
 			env->prog->aux->stack_depth = -off;
 
-		if (t == BPF_WRITE) {
-			if (!env->allow_ptr_leaks &&
-			    state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
-			    size != BPF_REG_SIZE) {
-				verbose("attempt to corrupt spilled pointer on stack\n");
-				return -EACCES;
-			}
-			err = check_stack_write(state, off, size, value_regno);
-		} else {
-			err = check_stack_read(state, off, size, value_regno);
-		}
-	} else if (reg->type == PTR_TO_PACKET) {
+		if (t == BPF_WRITE)
+			err = check_stack_write(env, state, off, size,
+						value_regno);
+		else
+			err = check_stack_read(env, state, off, size,
+					       value_regno);
+	} else if (reg_is_pkt_pointer(reg)) {
 		if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
-			verbose("cannot write into packet\n");
+			verbose(env, "cannot write into packet\n");
 			return -EACCES;
 		}
 		if (t == BPF_WRITE && value_regno >= 0 &&
 		    is_pointer_value(env, value_regno)) {
-			verbose("R%d leaks addr into packet\n", value_regno);
+			verbose(env, "R%d leaks addr into packet\n",
+				value_regno);
 			return -EACCES;
 		}
-		err = check_packet_access(env, regno, off, size);
+		err = check_packet_access(env, regno, off, size, false);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown(state->regs, value_regno);
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
-		verbose("R%d invalid mem access '%s'\n",
-			regno, reg_type_str[reg->type]);
+		verbose(env, "R%d invalid mem access '%s'\n", regno,
+			reg_type_str[reg->type]);
 		return -EACCES;
 	}
 
 	if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
-	    state->regs[value_regno].type == SCALAR_VALUE) {
+	    regs[value_regno].type == SCALAR_VALUE) {
 		/* b/h/w load zero-extends, mark upper bits as known 0 */
-		state->regs[value_regno].var_off = tnum_cast(
-					state->regs[value_regno].var_off, size);
-		__update_reg_bounds(&state->regs[value_regno]);
+		regs[value_regno].var_off =
+			tnum_cast(regs[value_regno].var_off, size);
+		__update_reg_bounds(&regs[value_regno]);
 	}
 	return err;
 }
@@ -1213,7 +1213,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 
 	if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
 	    insn->imm != 0) {
-		verbose("BPF_XADD uses reserved fields\n");
+		verbose(env, "BPF_XADD uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -1228,7 +1228,7 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins
 		return err;
 
 	if (is_pointer_value(env, insn->src_reg)) {
-		verbose("R%d leaks addr into mem\n", insn->src_reg);
+		verbose(env, "R%d leaks addr into mem\n", insn->src_reg);
 		return -EACCES;
 	}
 
@@ -1259,9 +1259,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 				int access_size, bool zero_size_allowed,
 				struct bpf_call_arg_meta *meta)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = state->regs;
-	int off, i;
+	int off, i, slot, spi;
 
 	if (regs[regno].type != PTR_TO_STACK) {
 		/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1269,7 +1269,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		    register_is_null(regs[regno]))
 			return 0;
 
-		verbose("R%d type=%s expected=%s\n", regno,
+		verbose(env, "R%d type=%s expected=%s\n", regno,
 			reg_type_str[regs[regno].type],
 			reg_type_str[PTR_TO_STACK]);
 		return -EACCES;
@@ -1280,13 +1280,13 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 		char tn_buf[48];
 
 		tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off);
-		verbose("invalid variable stack read R%d var_off=%s\n",
+		verbose(env, "invalid variable stack read R%d var_off=%s\n",
 			regno, tn_buf);
 	}
 	off = regs[regno].off + regs[regno].var_off.value;
 	if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
-	    access_size <= 0) {
-		verbose("invalid stack type R%d off=%d access_size=%d\n",
+	    access_size < 0 || (access_size == 0 && !zero_size_allowed)) {
+		verbose(env, "invalid stack type R%d off=%d access_size=%d\n",
 			regno, off, access_size);
 		return -EACCES;
 	}
@@ -1301,8 +1301,12 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	}
 
 	for (i = 0; i < access_size; i++) {
-		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
-			verbose("invalid indirect read from stack off %d+%d size %d\n",
+		slot = -(off + i) - 1;
+		spi = slot / BPF_REG_SIZE;
+		if (state->allocated_stack <= slot ||
+		    state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+			STACK_MISC) {
+			verbose(env, "invalid indirect read from stack off %d+%d size %d\n",
 				off, i, access_size);
 			return -EACCES;
 		}
@@ -1314,13 +1318,16 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 				   int access_size, bool zero_size_allowed,
 				   struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 
 	switch (reg->type) {
 	case PTR_TO_PACKET:
-		return check_packet_access(env, regno, reg->off, access_size);
+	case PTR_TO_PACKET_META:
+		return check_packet_access(env, regno, reg->off, access_size,
+					   zero_size_allowed);
 	case PTR_TO_MAP_VALUE:
-		return check_map_access(env, regno, reg->off, access_size);
+		return check_map_access(env, regno, reg->off, access_size,
+					zero_size_allowed);
 	default: /* scalar_value|ptr_to_stack or invalid ptr */
 		return check_stack_boundary(env, regno, access_size,
 					    zero_size_allowed, meta);
@@ -1331,7 +1338,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
 			  struct bpf_call_arg_meta *meta)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
 	enum bpf_reg_type expected_type, type = reg->type;
 	int err = 0;
 
@@ -1344,22 +1351,24 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	if (arg_type == ARG_ANYTHING) {
 		if (is_pointer_value(env, regno)) {
-			verbose("R%d leaks addr into helper function\n", regno);
+			verbose(env, "R%d leaks addr into helper function\n",
+				regno);
 			return -EACCES;
 		}
 		return 0;
 	}
 
-	if (type == PTR_TO_PACKET &&
+	if (type_is_pkt_pointer(type) &&
 	    !may_access_direct_pkt_data(env, meta, BPF_READ)) {
-		verbose("helper access to the packet is not allowed\n");
+		verbose(env, "helper access to the packet is not allowed\n");
 		return -EACCES;
 	}
 
 	if (arg_type == ARG_PTR_TO_MAP_KEY ||
 	    arg_type == ARG_PTR_TO_MAP_VALUE) {
 		expected_type = PTR_TO_STACK;
-		if (type != PTR_TO_PACKET && type != expected_type)
+		if (!type_is_pkt_pointer(type) &&
+		    type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_SIZE ||
 		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
@@ -1375,20 +1384,23 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		if (type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_PTR_TO_MEM ||
+		   arg_type == ARG_PTR_TO_MEM_OR_NULL ||
 		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a SCALAR_VALUE type. Final test
 		 * happens during stack boundary checking.
 		 */
-		if (register_is_null(*reg))
+		if (register_is_null(*reg) &&
+		    arg_type == ARG_PTR_TO_MEM_OR_NULL)
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+		else if (!type_is_pkt_pointer(type) &&
+			 type != PTR_TO_MAP_VALUE &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
 	} else {
-		verbose("unsupported arg_type %d\n", arg_type);
+		verbose(env, "unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
 	}
 
@@ -1406,12 +1418,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			 * we have to check map_key here. Otherwise it means
 			 * that kernel subsystem misconfigured verifier
 			 */
-			verbose("invalid map_ptr to access map->key\n");
+			verbose(env, "invalid map_ptr to access map->key\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->key_size);
+						  meta->map_ptr->key_size,
+						  false);
 		else
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->key_size,
@@ -1422,12 +1435,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (!meta->map_ptr) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("invalid map_ptr to access map->value\n");
+			verbose(env, "invalid map_ptr to access map->value\n");
 			return -EACCES;
 		}
-		if (type == PTR_TO_PACKET)
+		if (type_is_pkt_pointer(type))
 			err = check_packet_access(env, regno, reg->off,
-						  meta->map_ptr->value_size);
+						  meta->map_ptr->value_size,
+						  false);
 		else
 			err = check_stack_boundary(env, regno,
 						   meta->map_ptr->value_size,
@@ -1442,7 +1456,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (regno == 0) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("ARG_CONST_SIZE cannot be first argument\n");
+			verbose(env,
+				"ARG_CONST_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
 
@@ -1459,7 +1474,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			meta = NULL;
 
 		if (reg->smin_value < 0) {
-			verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+			verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n",
 				regno);
 			return -EACCES;
 		}
@@ -1473,7 +1488,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		}
 
 		if (reg->umax_value >= BPF_MAX_VAR_SIZ) {
-			verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+			verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
 				regno);
 			return -EACCES;
 		}
@@ -1484,12 +1499,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 
 	return err;
 err_type:
-	verbose("R%d type=%s expected=%s\n", regno,
+	verbose(env, "R%d type=%s expected=%s\n", regno,
 		reg_type_str[type], reg_type_str[expected_type]);
 	return -EACCES;
 }
 
-static int check_map_func_compatibility(struct bpf_map *map, int func_id)
+static int check_map_func_compatibility(struct bpf_verifier_env *env,
+					struct bpf_map *map, int func_id)
 {
 	if (!map)
 		return 0;
@@ -1502,7 +1518,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_MAP_TYPE_PERF_EVENT_ARRAY:
 		if (func_id != BPF_FUNC_perf_event_read &&
-		    func_id != BPF_FUNC_perf_event_output)
+		    func_id != BPF_FUNC_perf_event_output &&
+		    func_id != BPF_FUNC_perf_event_read_value)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_STACK_TRACE:
@@ -1522,6 +1539,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
+	/* Restrict bpf side of cpumap, open when use-cases appear */
+	case BPF_MAP_TYPE_CPUMAP:
+		if (func_id != BPF_FUNC_redirect_map)
+			goto error;
+		break;
 	case BPF_MAP_TYPE_ARRAY_OF_MAPS:
 	case BPF_MAP_TYPE_HASH_OF_MAPS:
 		if (func_id != BPF_FUNC_map_lookup_elem)
@@ -1545,6 +1567,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		break;
 	case BPF_FUNC_perf_event_read:
 	case BPF_FUNC_perf_event_output:
+	case BPF_FUNC_perf_event_read_value:
 		if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
 			goto error;
 		break;
@@ -1558,7 +1581,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_FUNC_redirect_map:
-		if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_CPUMAP)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
@@ -1575,7 +1599,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 
 	return 0;
 error:
-	verbose("cannot pass map_type %d into func %s#%d\n",
+	verbose(env, "cannot pass map_type %d into func %s#%d\n",
 		map->map_type, func_id_name(func_id), func_id);
 	return -EINVAL;
 }
@@ -1598,57 +1622,55 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 	return count > 1 ? -EINVAL : 0;
 }
 
-/* Packet data might have moved, any old PTR_TO_PACKET[_END] are now invalid,
- * so turn them into unknown SCALAR_VALUE.
+/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END]
+ * are now invalid, so turn them into unknown SCALAR_VALUE.
  */
 static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state = env->cur_state;
 	struct bpf_reg_state *regs = state->regs, *reg;
 	int i;
 
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET ||
-		    regs[i].type == PTR_TO_PACKET_END)
-			mark_reg_unknown(regs, i);
+		if (reg_is_pkt_pointer_any(&regs[i]))
+			mark_reg_unknown(env, regs, i);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
-			continue;
-		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type != PTR_TO_PACKET &&
-		    reg->type != PTR_TO_PACKET_END)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		__mark_reg_unknown(reg);
+		reg = &state->stack[i].spilled_ptr;
+		if (reg_is_pkt_pointer_any(reg))
+			__mark_reg_unknown(reg);
 	}
 }
 
 static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
 	const struct bpf_func_proto *fn = NULL;
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *regs;
 	struct bpf_call_arg_meta meta;
 	bool changes_data;
 	int i, err;
 
 	/* find function prototype */
 	if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
-		verbose("invalid func %s#%d\n", func_id_name(func_id), func_id);
+		verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
+			func_id);
 		return -EINVAL;
 	}
 
-	if (env->prog->aux->ops->get_func_proto)
-		fn = env->prog->aux->ops->get_func_proto(func_id);
+	if (env->ops->get_func_proto)
+		fn = env->ops->get_func_proto(func_id);
 
 	if (!fn) {
-		verbose("unknown func %s#%d\n", func_id_name(func_id), func_id);
+		verbose(env, "unknown func %s#%d\n", func_id_name(func_id),
+			func_id);
 		return -EINVAL;
 	}
 
 	/* eBPF programs must be GPL compatible to use GPL-ed functions */
 	if (!env->prog->gpl_compatible && fn->gpl_only) {
-		verbose("cannot call GPL only function from proprietary program\n");
+		verbose(env, "cannot call GPL only function from proprietary program\n");
 		return -EINVAL;
 	}
 
@@ -1662,7 +1684,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 	 */
 	err = check_raw_mode(fn);
 	if (err) {
-		verbose("kernel subsystem misconfigured func %s#%d\n",
+		verbose(env, "kernel subsystem misconfigured func %s#%d\n",
 			func_id_name(func_id), func_id);
 		return err;
 	}
@@ -1693,16 +1715,17 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 			return err;
 	}
 
+	regs = cur_regs(env);
 	/* reset caller saved regs */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		mark_reg_not_init(regs, caller_saved[i]);
+		mark_reg_not_init(env, regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
 	/* update return register (already marked as written above) */
 	if (fn->ret_type == RET_INTEGER) {
 		/* sets type to SCALAR_VALUE */
-		mark_reg_unknown(regs, BPF_REG_0);
+		mark_reg_unknown(env, regs, BPF_REG_0);
 	} else if (fn->ret_type == RET_VOID) {
 		regs[BPF_REG_0].type = NOT_INIT;
 	} else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
@@ -1710,14 +1733,15 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 
 		regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
 		/* There is no offset yet applied, variable or fixed */
-		mark_reg_known_zero(regs, BPF_REG_0);
+		mark_reg_known_zero(env, regs, BPF_REG_0);
 		regs[BPF_REG_0].off = 0;
 		/* remember map_ptr, so that check_map_access()
 		 * can check 'value_size' boundary of memory access
 		 * to map element returned from bpf_map_lookup_elem()
 		 */
 		if (meta.map_ptr == NULL) {
-			verbose("kernel subsystem misconfigured verifier\n");
+			verbose(env,
+				"kernel subsystem misconfigured verifier\n");
 			return -EINVAL;
 		}
 		regs[BPF_REG_0].map_ptr = meta.map_ptr;
@@ -1728,12 +1752,12 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 		else if (insn_aux->map_ptr != meta.map_ptr)
 			insn_aux->map_ptr = BPF_MAP_PTR_POISON;
 	} else {
-		verbose("unknown return type %d of func %s#%d\n",
+		verbose(env, "unknown return type %d of func %s#%d\n",
 			fn->ret_type, func_id_name(func_id), func_id);
 		return -EINVAL;
 	}
 
-	err = check_map_func_compatibility(meta.map_ptr, func_id);
+	err = check_map_func_compatibility(env, meta.map_ptr, func_id);
 	if (err)
 		return err;
 
@@ -1780,7 +1804,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 				   const struct bpf_reg_state *ptr_reg,
 				   const struct bpf_reg_state *off_reg)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+	struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
 	bool known = tnum_is_const(off_reg->var_off);
 	s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
 	    smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
@@ -1792,39 +1816,42 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 	dst_reg = &regs[dst];
 
 	if (WARN_ON_ONCE(known && (smin_val != smax_val))) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: known but bad sbounds\n");
+		print_verifier_state(env, env->cur_state);
+		verbose(env,
+			"verifier internal error: known but bad sbounds\n");
 		return -EINVAL;
 	}
 	if (WARN_ON_ONCE(known && (umin_val != umax_val))) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: known but bad ubounds\n");
+		print_verifier_state(env, env->cur_state);
+		verbose(env,
+			"verifier internal error: known but bad ubounds\n");
 		return -EINVAL;
 	}
 
 	if (BPF_CLASS(insn->code) != BPF_ALU64) {
 		/* 32-bit ALU ops on pointers produce (meaningless) scalars */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d 32-bit pointer arithmetic prohibited\n",
+			verbose(env,
+				"R%d 32-bit pointer arithmetic prohibited\n",
 				dst);
 		return -EACCES;
 	}
 
 	if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
+			verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n",
 				dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == CONST_PTR_TO_MAP) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
+			verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n",
 				dst);
 		return -EACCES;
 	}
 	if (ptr_reg->type == PTR_TO_PACKET_END) {
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
+			verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n",
 				dst);
 		return -EACCES;
 	}
@@ -1879,7 +1906,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			dst_reg->range = 0;
@@ -1889,7 +1916,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		if (dst_reg == off_reg) {
 			/* scalar -= pointer.  Creates an unknown scalar */
 			if (!env->allow_ptr_leaks)
-				verbose("R%d tried to subtract pointer from scalar\n",
+				verbose(env, "R%d tried to subtract pointer from scalar\n",
 					dst);
 			return -EACCES;
 		}
@@ -1899,7 +1926,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 */
 		if (ptr_reg->type == PTR_TO_STACK) {
 			if (!env->allow_ptr_leaks)
-				verbose("R%d subtraction from stack pointer prohibited\n",
+				verbose(env, "R%d subtraction from stack pointer prohibited\n",
 					dst);
 			return -EACCES;
 		}
@@ -1939,7 +1966,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		}
 		dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off);
 		dst_reg->off = ptr_reg->off;
-		if (ptr_reg->type == PTR_TO_PACKET) {
+		if (reg_is_pkt_pointer(ptr_reg)) {
 			dst_reg->id = ++env->id_gen;
 			/* something was added to pkt_ptr, set range to zero */
 			if (smin_val < 0)
@@ -1954,13 +1981,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
 		 * ptr &= ~3 which would reduce min_value by 3.)
 		 */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d bitwise operator %s on pointer prohibited\n",
+			verbose(env, "R%d bitwise operator %s on pointer prohibited\n",
 				dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	default:
 		/* other operators (e.g. MUL,LSH) produce non-pointer results */
 		if (!env->allow_ptr_leaks)
-			verbose("R%d pointer arithmetic with %s operator prohibited\n",
+			verbose(env, "R%d pointer arithmetic with %s operator prohibited\n",
 				dst, bpf_alu_string[opcode >> 4]);
 		return -EACCES;
 	}
@@ -1976,7 +2003,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 				      struct bpf_reg_state *dst_reg,
 				      struct bpf_reg_state src_reg)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 opcode = BPF_OP(insn->code);
 	bool src_known, dst_known;
 	s64 smin_val, smax_val;
@@ -2126,7 +2153,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			/* Shifts greater than 63 are undefined.  This includes
 			 * shifts by a negative number.
 			 */
-			mark_reg_unknown(regs, insn->dst_reg);
+			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
 		}
 		/* We lose all sign bit information (except what we can pick
@@ -2154,7 +2181,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 			/* Shifts greater than 63 are undefined.  This includes
 			 * shifts by a negative number.
 			 */
-			mark_reg_unknown(regs, insn->dst_reg);
+			mark_reg_unknown(env, regs, insn->dst_reg);
 			break;
 		}
 		/* BPF_RSH is an unsigned shift, so make the appropriate casts */
@@ -2182,7 +2209,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 		__update_reg_bounds(dst_reg);
 		break;
 	default:
-		mark_reg_unknown(regs, insn->dst_reg);
+		mark_reg_unknown(env, regs, insn->dst_reg);
 		break;
 	}
 
@@ -2197,7 +2224,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
 static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				   struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+	struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
 	struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
 	u8 opcode = BPF_OP(insn->code);
 	int rc;
@@ -2214,12 +2241,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 				 * an arbitrary scalar.
 				 */
 				if (!env->allow_ptr_leaks) {
-					verbose("R%d pointer %s pointer prohibited\n",
+					verbose(env, "R%d pointer %s pointer prohibited\n",
 						insn->dst_reg,
 						bpf_alu_string[opcode >> 4]);
 					return -EACCES;
 				}
-				mark_reg_unknown(regs, insn->dst_reg);
+				mark_reg_unknown(env, regs, insn->dst_reg);
 				return 0;
 			} else {
 				/* scalar += pointer
@@ -2271,13 +2298,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 
 	/* Got here implies adding two SCALAR_VALUEs */
 	if (WARN_ON_ONCE(ptr_reg)) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: unexpected ptr_reg\n");
+		print_verifier_state(env, env->cur_state);
+		verbose(env, "verifier internal error: unexpected ptr_reg\n");
 		return -EINVAL;
 	}
 	if (WARN_ON(!src_reg)) {
-		print_verifier_state(&env->cur_state);
-		verbose("verifier internal error: no src_reg\n");
+		print_verifier_state(env, env->cur_state);
+		verbose(env, "verifier internal error: no src_reg\n");
 		return -EINVAL;
 	}
 	return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg);
@@ -2286,7 +2313,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
 /* check validity of 32-bit and 64-bit arithmetic operations */
 static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
@@ -2295,14 +2322,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			if (BPF_SRC(insn->code) != 0 ||
 			    insn->src_reg != BPF_REG_0 ||
 			    insn->off != 0 || insn->imm != 0) {
-				verbose("BPF_NEG uses reserved fields\n");
+				verbose(env, "BPF_NEG uses reserved fields\n");
 				return -EINVAL;
 			}
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
 			    (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
 			    BPF_CLASS(insn->code) == BPF_ALU64) {
-				verbose("BPF_END uses reserved fields\n");
+				verbose(env, "BPF_END uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2313,7 +2340,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			return err;
 
 		if (is_pointer_value(env, insn->dst_reg)) {
-			verbose("R%d pointer arithmetic prohibited\n",
+			verbose(env, "R%d pointer arithmetic prohibited\n",
 				insn->dst_reg);
 			return -EACCES;
 		}
@@ -2327,7 +2354,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
-				verbose("BPF_MOV uses reserved fields\n");
+				verbose(env, "BPF_MOV uses reserved fields\n");
 				return -EINVAL;
 			}
 
@@ -2337,7 +2364,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				return err;
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-				verbose("BPF_MOV uses reserved fields\n");
+				verbose(env, "BPF_MOV uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2357,11 +2384,12 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			} else {
 				/* R1 = (u32) R2 */
 				if (is_pointer_value(env, insn->src_reg)) {
-					verbose("R%d partial copy of pointer\n",
+					verbose(env,
+						"R%d partial copy of pointer\n",
 						insn->src_reg);
 					return -EACCES;
 				}
-				mark_reg_unknown(regs, insn->dst_reg);
+				mark_reg_unknown(env, regs, insn->dst_reg);
 				/* high 32 bits are known zero. */
 				regs[insn->dst_reg].var_off = tnum_cast(
 						regs[insn->dst_reg].var_off, 4);
@@ -2376,14 +2404,14 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		}
 
 	} else if (opcode > BPF_END) {
-		verbose("invalid BPF_ALU opcode %x\n", opcode);
+		verbose(env, "invalid BPF_ALU opcode %x\n", opcode);
 		return -EINVAL;
 
 	} else {	/* all other ALU ops: and, sub, xor, add, ... */
 
 		if (BPF_SRC(insn->code) == BPF_X) {
 			if (insn->imm != 0 || insn->off != 0) {
-				verbose("BPF_ALU uses reserved fields\n");
+				verbose(env, "BPF_ALU uses reserved fields\n");
 				return -EINVAL;
 			}
 			/* check src1 operand */
@@ -2392,7 +2420,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 				return err;
 		} else {
 			if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
-				verbose("BPF_ALU uses reserved fields\n");
+				verbose(env, "BPF_ALU uses reserved fields\n");
 				return -EINVAL;
 			}
 		}
@@ -2404,7 +2432,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 		if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
 		    BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
-			verbose("div by zero\n");
+			verbose(env, "div by zero\n");
 			return -EINVAL;
 		}
 
@@ -2413,7 +2441,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32;
 
 			if (insn->imm < 0 || insn->imm >= size) {
-				verbose("invalid shift %d\n", insn->imm);
+				verbose(env, "invalid shift %d\n", insn->imm);
 				return -EINVAL;
 			}
 		}
@@ -2431,6 +2459,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 				   struct bpf_reg_state *dst_reg,
+				   enum bpf_reg_type type,
 				   bool range_right_open)
 {
 	struct bpf_reg_state *regs = state->regs, *reg;
@@ -2501,15 +2530,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
 	 * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16.
 	 */
 	for (i = 0; i < MAX_BPF_REG; i++)
-		if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
+		if (regs[i].type == type && regs[i].id == dst_reg->id)
 			/* keep the maximum range already checked */
 			regs[i].range = max(regs[i].range, new_range);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		reg = &state->spilled_regs[i / BPF_REG_SIZE];
-		if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
+		reg = &state->stack[i].spilled_ptr;
+		if (reg->type == type && reg->id == dst_reg->id)
 			reg->range = max(reg->range, new_range);
 	}
 }
@@ -2758,29 +2787,122 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
 	for (i = 0; i < MAX_BPF_REG; i++)
 		mark_map_reg(regs, i, id, is_null);
 
-	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
-		if (state->stack_slot_type[i] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
+		mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
+	}
+}
+
+static bool try_match_pkt_pointers(const struct bpf_insn *insn,
+				   struct bpf_reg_state *dst_reg,
+				   struct bpf_reg_state *src_reg,
+				   struct bpf_verifier_state *this_branch,
+				   struct bpf_verifier_state *other_branch)
+{
+	if (BPF_SRC(insn->code) != BPF_X)
+		return false;
+
+	switch (BPF_OP(insn->code)) {
+	case BPF_JGT:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' > pkt_end, pkt_meta' > pkt_data */
+			find_good_pkt_pointers(this_branch, dst_reg,
+					       dst_reg->type, false);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end > pkt_data', pkt_data > pkt_meta' */
+			find_good_pkt_pointers(other_branch, src_reg,
+					       src_reg->type, true);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JLT:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' < pkt_end, pkt_meta' < pkt_data */
+			find_good_pkt_pointers(other_branch, dst_reg,
+					       dst_reg->type, true);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end < pkt_data', pkt_data > pkt_meta' */
+			find_good_pkt_pointers(this_branch, src_reg,
+					       src_reg->type, false);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JGE:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */
+			find_good_pkt_pointers(this_branch, dst_reg,
+					       dst_reg->type, true);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end >= pkt_data', pkt_data >= pkt_meta' */
+			find_good_pkt_pointers(other_branch, src_reg,
+					       src_reg->type, false);
+		} else {
+			return false;
+		}
+		break;
+	case BPF_JLE:
+		if ((dst_reg->type == PTR_TO_PACKET &&
+		     src_reg->type == PTR_TO_PACKET_END) ||
+		    (dst_reg->type == PTR_TO_PACKET_META &&
+		     reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) {
+			/* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */
+			find_good_pkt_pointers(other_branch, dst_reg,
+					       dst_reg->type, false);
+		} else if ((dst_reg->type == PTR_TO_PACKET_END &&
+			    src_reg->type == PTR_TO_PACKET) ||
+			   (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) &&
+			    src_reg->type == PTR_TO_PACKET_META)) {
+			/* pkt_end <= pkt_data', pkt_data <= pkt_meta' */
+			find_good_pkt_pointers(this_branch, src_reg,
+					       src_reg->type, true);
+		} else {
+			return false;
+		}
+		break;
+	default:
+		return false;
 	}
+
+	return true;
 }
 
 static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			     struct bpf_insn *insn, int *insn_idx)
 {
-	struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+	struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
 	struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
 	u8 opcode = BPF_OP(insn->code);
 	int err;
 
 	if (opcode > BPF_JSLE) {
-		verbose("invalid BPF_JMP opcode %x\n", opcode);
+		verbose(env, "invalid BPF_JMP opcode %x\n", opcode);
 		return -EINVAL;
 	}
 
 	if (BPF_SRC(insn->code) == BPF_X) {
 		if (insn->imm != 0) {
-			verbose("BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP uses reserved fields\n");
 			return -EINVAL;
 		}
 
@@ -2790,13 +2912,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 			return err;
 
 		if (is_pointer_value(env, insn->src_reg)) {
-			verbose("R%d pointer comparison prohibited\n",
+			verbose(env, "R%d pointer comparison prohibited\n",
 				insn->src_reg);
 			return -EACCES;
 		}
 	} else {
 		if (insn->src_reg != BPF_REG_0) {
-			verbose("BPF_JMP uses reserved fields\n");
+			verbose(env, "BPF_JMP uses reserved fields\n");
 			return -EINVAL;
 		}
 	}
@@ -2871,52 +2993,15 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
 		 */
 		mark_map_regs(this_branch, insn->dst_reg, opcode == BPF_JNE);
 		mark_map_regs(other_branch, insn->dst_reg, opcode == BPF_JEQ);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' > pkt_end */
-		find_good_pkt_pointers(this_branch, dst_reg, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end > pkt_data' */
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' < pkt_end */
-		find_good_pkt_pointers(other_branch, dst_reg, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLT &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end < pkt_data' */
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' >= pkt_end */
-		find_good_pkt_pointers(this_branch, dst_reg, true);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGE &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end >= pkt_data' */
-		find_good_pkt_pointers(other_branch, &regs[insn->src_reg], false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   dst_reg->type == PTR_TO_PACKET &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET_END) {
-		/* pkt_data' <= pkt_end */
-		find_good_pkt_pointers(other_branch, dst_reg, false);
-	} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JLE &&
-		   dst_reg->type == PTR_TO_PACKET_END &&
-		   regs[insn->src_reg].type == PTR_TO_PACKET) {
-		/* pkt_end <= pkt_data' */
-		find_good_pkt_pointers(this_branch, &regs[insn->src_reg], true);
-	} else if (is_pointer_value(env, insn->dst_reg)) {
-		verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+	} else if (!try_match_pkt_pointers(insn, dst_reg, &regs[insn->src_reg],
+					   this_branch, other_branch) &&
+		   is_pointer_value(env, insn->dst_reg)) {
+		verbose(env, "R%d pointer comparison prohibited\n",
+			insn->dst_reg);
 		return -EACCES;
 	}
-	if (log_level)
-		print_verifier_state(this_branch);
+	if (env->log.level)
+		print_verifier_state(env, this_branch);
 	return 0;
 }
 
@@ -2931,15 +3016,15 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
 /* verify BPF_LD_IMM64 instruction */
 static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	int err;
 
 	if (BPF_SIZE(insn->code) != BPF_DW) {
-		verbose("invalid BPF_LD_IMM insn\n");
+		verbose(env, "invalid BPF_LD_IMM insn\n");
 		return -EINVAL;
 	}
 	if (insn->off != 0) {
-		verbose("BPF_LD_IMM64 uses reserved fields\n");
+		verbose(env, "BPF_LD_IMM64 uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -2992,19 +3077,19 @@ static bool may_access_skb(enum bpf_prog_type type)
  */
 static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 {
-	struct bpf_reg_state *regs = env->cur_state.regs;
+	struct bpf_reg_state *regs = cur_regs(env);
 	u8 mode = BPF_MODE(insn->code);
 	int i, err;
 
 	if (!may_access_skb(env->prog->type)) {
-		verbose("BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
+		verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n");
 		return -EINVAL;
 	}
 
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
 	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
-		verbose("BPF_LD_[ABS|IND] uses reserved fields\n");
+		verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n");
 		return -EINVAL;
 	}
 
@@ -3014,7 +3099,8 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
-		verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+		verbose(env,
+			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
 		return -EINVAL;
 	}
 
@@ -3027,7 +3113,7 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 
 	/* reset caller saved regs to unreadable */
 	for (i = 0; i < CALLER_SAVED_REGS; i++) {
-		mark_reg_not_init(regs, caller_saved[i]);
+		mark_reg_not_init(env, regs, caller_saved[i]);
 		check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
 	}
 
@@ -3035,7 +3121,45 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 	 * the value fetched from the packet.
 	 * Already marked as written above.
 	 */
-	mark_reg_unknown(regs, BPF_REG_0);
+	mark_reg_unknown(env, regs, BPF_REG_0);
+	return 0;
+}
+
+static int check_return_code(struct bpf_verifier_env *env)
+{
+	struct bpf_reg_state *reg;
+	struct tnum range = tnum_range(0, 1);
+
+	switch (env->prog->type) {
+	case BPF_PROG_TYPE_CGROUP_SKB:
+	case BPF_PROG_TYPE_CGROUP_SOCK:
+	case BPF_PROG_TYPE_SOCK_OPS:
+	case BPF_PROG_TYPE_CGROUP_DEVICE:
+		break;
+	default:
+		return 0;
+	}
+
+	reg = cur_regs(env) + BPF_REG_0;
+	if (reg->type != SCALAR_VALUE) {
+		verbose(env, "At program exit the register R0 is not a known value (%s)\n",
+			reg_type_str[reg->type]);
+		return -EINVAL;
+	}
+
+	if (!tnum_in(range, reg->var_off)) {
+		verbose(env, "At program exit the register R0 ");
+		if (!tnum_is_unknown(reg->var_off)) {
+			char tn_buf[48];
+
+			tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+			verbose(env, "has value %s", tn_buf);
+		} else {
+			verbose(env, "has unknown scalar value");
+		}
+		verbose(env, " should have been 0 or 1\n");
+		return -EINVAL;
+	}
 	return 0;
 }
 
@@ -3099,7 +3223,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		return 0;
 
 	if (w < 0 || w >= env->prog->len) {
-		verbose("jump out of range from insn %d to %d\n", t, w);
+		verbose(env, "jump out of range from insn %d to %d\n", t, w);
 		return -EINVAL;
 	}
 
@@ -3116,13 +3240,13 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env)
 		insn_stack[cur_stack++] = w;
 		return 1;
 	} else if ((insn_state[w] & 0xF0) == DISCOVERED) {
-		verbose("back-edge from insn %d to %d\n", t, w);
+		verbose(env, "back-edge from insn %d to %d\n", t, w);
 		return -EINVAL;
 	} else if (insn_state[w] == EXPLORED) {
 		/* forward- or cross-edge */
 		insn_state[t] = DISCOVERED | e;
 	} else {
-		verbose("insn state internal bug\n");
+		verbose(env, "insn state internal bug\n");
 		return -EFAULT;
 	}
 	return 0;
@@ -3216,7 +3340,7 @@ peek_stack:
 mark_explored:
 	insn_state[t] = EXPLORED;
 	if (cur_stack-- <= 0) {
-		verbose("pop stack internal bug\n");
+		verbose(env, "pop stack internal bug\n");
 		ret = -EFAULT;
 		goto err_free;
 	}
@@ -3225,7 +3349,7 @@ mark_explored:
 check_state:
 	for (i = 0; i < insn_cnt; i++) {
 		if (insn_state[i] != EXPLORED) {
-			verbose("unreachable insn %d\n", i);
+			verbose(env, "unreachable insn %d\n", i);
 			ret = -EINVAL;
 			goto err_free;
 		}
@@ -3340,8 +3464,9 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 			return false;
 		/* Check our ids match any regs they're supposed to */
 		return check_ids(rold->id, rcur->id, idmap);
+	case PTR_TO_PACKET_META:
 	case PTR_TO_PACKET:
-		if (rcur->type != PTR_TO_PACKET)
+		if (rcur->type != rold->type)
 			return false;
 		/* We must have at least as much range as the old ptr
 		 * did, so that any accesses which were safe before are
@@ -3379,6 +3504,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	return false;
 }
 
+static bool stacksafe(struct bpf_verifier_state *old,
+		      struct bpf_verifier_state *cur,
+		      struct idpair *idmap)
+{
+	int i, spi;
+
+	/* if explored stack has more populated slots than current stack
+	 * such stacks are not equivalent
+	 */
+	if (old->allocated_stack > cur->allocated_stack)
+		return false;
+
+	/* walk slots of the explored stack and ignore any additional
+	 * slots in the current stack, since explored(safe) state
+	 * didn't use them
+	 */
+	for (i = 0; i < old->allocated_stack; i++) {
+		spi = i / BPF_REG_SIZE;
+
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+			continue;
+		if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+		    cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+			/* Ex: old explored (safe) state has STACK_SPILL in
+			 * this stack slot, but current has has STACK_MISC ->
+			 * this verifier states are not equivalent,
+			 * return false to continue verification of this path
+			 */
+			return false;
+		if (i % BPF_REG_SIZE)
+			continue;
+		if (old->stack[spi].slot_type[0] != STACK_SPILL)
+			continue;
+		if (!regsafe(&old->stack[spi].spilled_ptr,
+			     &cur->stack[spi].spilled_ptr,
+			     idmap))
+			/* when explored and current stack slot are both storing
+			 * spilled registers, check that stored pointers types
+			 * are the same as well.
+			 * Ex: explored safe path could have stored
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+			 * but current path has stored:
+			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+			 * such verifier states are not equivalent.
+			 * return false to continue verification of this path
+			 */
+			return false;
+	}
+	return true;
+}
+
 /* compare two verifier states
  *
  * all states stored in state_list are known to be valid, since
@@ -3423,37 +3599,8 @@ static bool states_equal(struct bpf_verifier_env *env,
 			goto out_free;
 	}
 
-	for (i = 0; i < MAX_BPF_STACK; i++) {
-		if (old->stack_slot_type[i] == STACK_INVALID)
-			continue;
-		if (old->stack_slot_type[i] != cur->stack_slot_type[i])
-			/* Ex: old explored (safe) state has STACK_SPILL in
-			 * this stack slot, but current has has STACK_MISC ->
-			 * this verifier states are not equivalent,
-			 * return false to continue verification of this path
-			 */
-			goto out_free;
-		if (i % BPF_REG_SIZE)
-			continue;
-		if (old->stack_slot_type[i] != STACK_SPILL)
-			continue;
-		if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
-			     &cur->spilled_regs[i / BPF_REG_SIZE],
-			     idmap))
-			/* when explored and current stack slot are both storing
-			 * spilled registers, check that stored pointers types
-			 * are the same as well.
-			 * Ex: explored safe path could have stored
-			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
-			 * but current path has stored:
-			 * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
-			 * such verifier states are not equivalent.
-			 * return false to continue verification of this path
-			 */
-			goto out_free;
-		else
-			continue;
-	}
+	if (!stacksafe(old, cur, idmap))
+		goto out_free;
 	ret = true;
 out_free:
 	kfree(idmap);
@@ -3489,17 +3636,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,
 		}
 	}
 	/* ... and stack slots */
-	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
-		if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+	for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+		    i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+		if (parent->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+		if (state->stack[i].slot_type[0] != STACK_SPILL)
 			continue;
-		if (parent->spilled_regs[i].live & REG_LIVE_READ)
+		if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
 			continue;
-		if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+		if (writes &&
+		    (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
 			continue;
-		if (state->spilled_regs[i].live & REG_LIVE_READ) {
-			parent->spilled_regs[i].live |= REG_LIVE_READ;
+		if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+			parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
 			touched = true;
 		}
 	}
@@ -3529,7 +3678,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 {
 	struct bpf_verifier_state_list *new_sl;
 	struct bpf_verifier_state_list *sl;
-	int i;
+	struct bpf_verifier_state *cur = env->cur_state;
+	int i, err;
 
 	sl = env->explored_states[insn_idx];
 	if (!sl)
@@ -3539,7 +3689,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 		return 0;
 
 	while (sl != STATE_LIST_MARK) {
-		if (states_equal(env, &sl->state, &env->cur_state)) {
+		if (states_equal(env, &sl->state, cur)) {
 			/* reached equivalent register/stack state,
 			 * prune the search.
 			 * Registers read by the continuation are read by us.
@@ -3550,7 +3700,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 			 * they'll be immediately forgotten as we're pruning
 			 * this state and will pop a new one.
 			 */
-			propagate_liveness(&sl->state, &env->cur_state);
+			propagate_liveness(&sl->state, cur);
 			return 1;
 		}
 		sl = sl->next;
@@ -3562,16 +3712,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	 * it will be rejected. Since there are no loops, we won't be
 	 * seeing this 'insn_idx' instruction again on the way to bpf_exit
 	 */
-	new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+	new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
 	if (!new_sl)
 		return -ENOMEM;
 
 	/* add new state to the head of linked list */
-	memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+	err = copy_verifier_state(&new_sl->state, cur);
+	if (err) {
+		free_verifier_state(&new_sl->state, false);
+		kfree(new_sl);
+		return err;
+	}
 	new_sl->next = env->explored_states[insn_idx];
 	env->explored_states[insn_idx] = new_sl;
 	/* connect new state to parentage chain */
-	env->cur_state.parent = &new_sl->state;
+	cur->parent = &new_sl->state;
 	/* clear write marks in current state: the writes we did are not writes
 	 * our child did, so they don't screen off its reads from us.
 	 * (There are no read marks in current state, because reads always mark
@@ -3579,33 +3734,37 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
 	 * explored_states can get read marks.)
 	 */
 	for (i = 0; i < BPF_REG_FP; i++)
-		env->cur_state.regs[i].live = REG_LIVE_NONE;
-	for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
-		if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
-			env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
+		cur->regs[i].live = REG_LIVE_NONE;
+	for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+		if (cur->stack[i].slot_type[0] == STACK_SPILL)
+			cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
 	return 0;
 }
 
 static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
 				  int insn_idx, int prev_insn_idx)
 {
-	if (!env->analyzer_ops || !env->analyzer_ops->insn_hook)
-		return 0;
+	if (env->dev_ops && env->dev_ops->insn_hook)
+		return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx);
 
-	return env->analyzer_ops->insn_hook(env, insn_idx, prev_insn_idx);
+	return 0;
 }
 
 static int do_check(struct bpf_verifier_env *env)
 {
-	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_verifier_state *state;
 	struct bpf_insn *insns = env->prog->insnsi;
-	struct bpf_reg_state *regs = state->regs;
+	struct bpf_reg_state *regs;
 	int insn_cnt = env->prog->len;
 	int insn_idx, prev_insn_idx = 0;
 	int insn_processed = 0;
 	bool do_print_state = false;
 
-	init_reg_state(regs);
+	state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+	if (!state)
+		return -ENOMEM;
+	env->cur_state = state;
+	init_reg_state(env, state->regs);
 	state->parent = NULL;
 	insn_idx = 0;
 	for (;;) {
@@ -3614,7 +3773,7 @@ static int do_check(struct bpf_verifier_env *env)
 		int err;
 
 		if (insn_idx >= insn_cnt) {
-			verbose("invalid insn idx %d insn_cnt %d\n",
+			verbose(env, "invalid insn idx %d insn_cnt %d\n",
 				insn_idx, insn_cnt);
 			return -EFAULT;
 		}
@@ -3623,7 +3782,8 @@ static int do_check(struct bpf_verifier_env *env)
 		class = BPF_CLASS(insn->code);
 
 		if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
-			verbose("BPF program is too large. Processed %d insn\n",
+			verbose(env,
+				"BPF program is too large. Processed %d insn\n",
 				insn_processed);
 			return -E2BIG;
 		}
@@ -3633,12 +3793,12 @@ static int do_check(struct bpf_verifier_env *env)
 			return err;
 		if (err == 1) {
 			/* found equivalent state, can prune the search */
-			if (log_level) {
+			if (env->log.level) {
 				if (do_print_state)
-					verbose("\nfrom %d to %d: safe\n",
+					verbose(env, "\nfrom %d to %d: safe\n",
 						prev_insn_idx, insn_idx);
 				else
-					verbose("%d: safe\n", insn_idx);
+					verbose(env, "%d: safe\n", insn_idx);
 			}
 			goto process_bpf_exit;
 		}
@@ -3646,25 +3806,28 @@ static int do_check(struct bpf_verifier_env *env)
 		if (need_resched())
 			cond_resched();
 
-		if (log_level > 1 || (log_level && do_print_state)) {
-			if (log_level > 1)
-				verbose("%d:", insn_idx);
+		if (env->log.level > 1 || (env->log.level && do_print_state)) {
+			if (env->log.level > 1)
+				verbose(env, "%d:", insn_idx);
 			else
-				verbose("\nfrom %d to %d:",
+				verbose(env, "\nfrom %d to %d:",
 					prev_insn_idx, insn_idx);
-			print_verifier_state(&env->cur_state);
+			print_verifier_state(env, state);
 			do_print_state = false;
 		}
 
-		if (log_level) {
-			verbose("%d: ", insn_idx);
-			print_bpf_insn(env, insn);
+		if (env->log.level) {
+			verbose(env, "%d: ", insn_idx);
+			print_bpf_insn(verbose, env, insn,
+				       env->allow_ptr_leaks);
 		}
 
 		err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
 		if (err)
 			return err;
 
+		regs = cur_regs(env);
+		env->insn_aux_data[insn_idx].seen = true;
 		if (class == BPF_ALU || class == BPF_ALU64) {
 			err = check_alu_op(env, insn);
 			if (err)
@@ -3714,7 +3877,7 @@ static int do_check(struct bpf_verifier_env *env)
 				 * src_reg == stack|map in some other branch.
 				 * Reject it.
 				 */
-				verbose("same insn cannot be used with different pointers\n");
+				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
 
@@ -3754,14 +3917,14 @@ static int do_check(struct bpf_verifier_env *env)
 			} else if (dst_reg_type != *prev_dst_type &&
 				   (dst_reg_type == PTR_TO_CTX ||
 				    *prev_dst_type == PTR_TO_CTX)) {
-				verbose("same insn cannot be used with different pointers\n");
+				verbose(env, "same insn cannot be used with different pointers\n");
 				return -EINVAL;
 			}
 
 		} else if (class == BPF_ST) {
 			if (BPF_MODE(insn->code) != BPF_MEM ||
 			    insn->src_reg != BPF_REG_0) {
-				verbose("BPF_ST uses reserved fields\n");
+				verbose(env, "BPF_ST uses reserved fields\n");
 				return -EINVAL;
 			}
 			/* check src operand */
@@ -3784,7 +3947,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->off != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_CALL uses reserved fields\n");
+					verbose(env, "BPF_CALL uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3797,7 +3960,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_JA uses reserved fields\n");
+					verbose(env, "BPF_JA uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3809,7 +3972,7 @@ static int do_check(struct bpf_verifier_env *env)
 				    insn->imm != 0 ||
 				    insn->src_reg != BPF_REG_0 ||
 				    insn->dst_reg != BPF_REG_0) {
-					verbose("BPF_EXIT uses reserved fields\n");
+					verbose(env, "BPF_EXIT uses reserved fields\n");
 					return -EINVAL;
 				}
 
@@ -3824,13 +3987,18 @@ static int do_check(struct bpf_verifier_env *env)
 					return err;
 
 				if (is_pointer_value(env, BPF_REG_0)) {
-					verbose("R0 leaks addr as return value\n");
+					verbose(env, "R0 leaks addr as return value\n");
 					return -EACCES;
 				}
 
+				err = check_return_code(env);
+				if (err)
+					return err;
 process_bpf_exit:
-				insn_idx = pop_stack(env, &prev_insn_idx);
-				if (insn_idx < 0) {
+				err = pop_stack(env, &prev_insn_idx, &insn_idx);
+				if (err < 0) {
+					if (err != -ENOENT)
+						return err;
 					break;
 				} else {
 					do_print_state = true;
@@ -3855,20 +4023,21 @@ process_bpf_exit:
 					return err;
 
 				insn_idx++;
+				env->insn_aux_data[insn_idx].seen = true;
 			} else {
-				verbose("invalid BPF_LD mode\n");
+				verbose(env, "invalid BPF_LD mode\n");
 				return -EINVAL;
 			}
 		} else {
-			verbose("unknown insn class %d\n", class);
+			verbose(env, "unknown insn class %d\n", class);
 			return -EINVAL;
 		}
 
 		insn_idx++;
 	}
 
-	verbose("processed %d insns, stack depth %d\n",
-		insn_processed, env->prog->aux->stack_depth);
+	verbose(env, "processed %d insns, stack depth %d\n", insn_processed,
+		env->prog->aux->stack_depth);
 	return 0;
 }
 
@@ -3880,7 +4049,8 @@ static int check_map_prealloc(struct bpf_map *map)
 		!(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
-static int check_map_prog_compatibility(struct bpf_map *map,
+static int check_map_prog_compatibility(struct bpf_verifier_env *env,
+					struct bpf_map *map,
 					struct bpf_prog *prog)
 
 {
@@ -3891,12 +4061,12 @@ static int check_map_prog_compatibility(struct bpf_map *map,
 	 */
 	if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
 		if (!check_map_prealloc(map)) {
-			verbose("perf_event programs can only use preallocated hash map\n");
+			verbose(env, "perf_event programs can only use preallocated hash map\n");
 			return -EINVAL;
 		}
 		if (map->inner_map_meta &&
 		    !check_map_prealloc(map->inner_map_meta)) {
-			verbose("perf_event programs can only use preallocated inner hash map\n");
+			verbose(env, "perf_event programs can only use preallocated inner hash map\n");
 			return -EINVAL;
 		}
 	}
@@ -3919,14 +4089,14 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (BPF_CLASS(insn->code) == BPF_LDX &&
 		    (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) {
-			verbose("BPF_LDX uses reserved fields\n");
+			verbose(env, "BPF_LDX uses reserved fields\n");
 			return -EINVAL;
 		}
 
 		if (BPF_CLASS(insn->code) == BPF_STX &&
 		    ((BPF_MODE(insn->code) != BPF_MEM &&
 		      BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) {
-			verbose("BPF_STX uses reserved fields\n");
+			verbose(env, "BPF_STX uses reserved fields\n");
 			return -EINVAL;
 		}
 
@@ -3937,7 +4107,7 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 			if (i == insn_cnt - 1 || insn[1].code != 0 ||
 			    insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
 			    insn[1].off != 0) {
-				verbose("invalid bpf_ld_imm64 insn\n");
+				verbose(env, "invalid bpf_ld_imm64 insn\n");
 				return -EINVAL;
 			}
 
@@ -3946,19 +4116,20 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
 				goto next_insn;
 
 			if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
-				verbose("unrecognized bpf_ld_imm64 insn\n");
+				verbose(env,
+					"unrecognized bpf_ld_imm64 insn\n");
 				return -EINVAL;
 			}
 
 			f = fdget(insn->imm);
 			map = __bpf_map_get(f);
 			if (IS_ERR(map)) {
-				verbose("fd %d is not pointing to valid bpf_map\n",
+				verbose(env, "fd %d is not pointing to valid bpf_map\n",
 					insn->imm);
 				return PTR_ERR(map);
 			}
 
-			err = check_map_prog_compatibility(map, env->prog);
+			err = check_map_prog_compatibility(env, map, env->prog);
 			if (err) {
 				fdput(f);
 				return err;
@@ -4035,6 +4206,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
 				u32 off, u32 cnt)
 {
 	struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+	int i;
 
 	if (cnt == 1)
 		return 0;
@@ -4044,6 +4216,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
 	memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
 	memcpy(new_data + off + cnt - 1, old_data + off,
 	       sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+	for (i = off; i < off + cnt - 1; i++)
+		new_data[i].seen = true;
 	env->insn_aux_data = new_data;
 	vfree(old_data);
 	return 0;
@@ -4062,12 +4236,31 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
 	return new_prog;
 }
 
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+	struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+	struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+	struct bpf_insn *insn = env->prog->insnsi;
+	const int insn_cnt = env->prog->len;
+	int i;
+
+	for (i = 0; i < insn_cnt; i++) {
+		if (aux_data[i].seen)
+			continue;
+		memcpy(insn + i, &nop, sizeof(nop));
+	}
+}
+
 /* convert load instructions that access fields of 'struct __sk_buff'
  * into sequence of instructions that access fields of 'struct sk_buff'
  */
 static int convert_ctx_accesses(struct bpf_verifier_env *env)
 {
-	const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+	const struct bpf_verifier_ops *ops = env->ops;
 	int i, cnt, size, ctx_field_size, delta = 0;
 	const int insn_cnt = env->prog->len;
 	struct bpf_insn insn_buf[16], *insn;
@@ -4080,7 +4273,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
 					env->prog);
 		if (cnt >= ARRAY_SIZE(insn_buf)) {
-			verbose("bpf verifier is misconfigured\n");
+			verbose(env, "bpf verifier is misconfigured\n");
 			return -EINVAL;
 		} else if (cnt) {
 			new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
@@ -4128,7 +4321,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 			u8 size_code;
 
 			if (type == BPF_WRITE) {
-				verbose("bpf verifier narrow ctx access misconfigured\n");
+				verbose(env, "bpf verifier narrow ctx access misconfigured\n");
 				return -EINVAL;
 			}
 
@@ -4147,7 +4340,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 					      &target_size);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
 		    (ctx_field_size && !target_size)) {
-			verbose("bpf verifier is misconfigured\n");
+			verbose(env, "bpf verifier is misconfigured\n");
 			return -EINVAL;
 		}
 
@@ -4229,7 +4422,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 
 			cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf);
 			if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
-				verbose("bpf verifier is misconfigured\n");
+				verbose(env, "bpf verifier is misconfigured\n");
 				return -EINVAL;
 			}
 
@@ -4268,12 +4461,13 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			insn      = new_prog->insnsi + i + delta;
 		}
 patch_call_imm:
-		fn = prog->aux->ops->get_func_proto(insn->imm);
+		fn = env->ops->get_func_proto(insn->imm);
 		/* all functions that have prototype and verifier allowed
 		 * programs to call them, must be real in-kernel functions
 		 */
 		if (!fn->func) {
-			verbose("kernel subsystem misconfigured func %s#%d\n",
+			verbose(env,
+				"kernel subsystem misconfigured func %s#%d\n",
 				func_id_name(insn->imm), insn->imm);
 			return -EFAULT;
 		}
@@ -4297,6 +4491,7 @@ static void free_states(struct bpf_verifier_env *env)
 		if (sl)
 			while (sl != STATE_LIST_MARK) {
 				sln = sl->next;
+				free_verifier_state(&sl->state, false);
 				kfree(sl);
 				sl = sln;
 			}
@@ -4307,16 +4502,21 @@ static void free_states(struct bpf_verifier_env *env)
 
 int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 {
-	char __user *log_ubuf = NULL;
 	struct bpf_verifier_env *env;
+	struct bpf_verifer_log *log;
 	int ret = -EINVAL;
 
+	/* no program is valid */
+	if (ARRAY_SIZE(bpf_verifier_ops) == 0)
+		return -EINVAL;
+
 	/* 'struct bpf_verifier_env' can be global, but since it's not small,
 	 * allocate/free it every time bpf_check() is called
 	 */
 	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
 	if (!env)
 		return -ENOMEM;
+	log = &env->log;
 
 	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
 				     (*prog)->len);
@@ -4324,6 +4524,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!env->insn_aux_data)
 		goto err_free_env;
 	env->prog = *prog;
+	env->ops = bpf_verifier_ops[env->prog->type];
 
 	/* grab the mutex to protect few globals used by verifier */
 	mutex_lock(&bpf_verifier_lock);
@@ -4332,29 +4533,27 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 		/* user requested verbose verifier output
 		 * and supplied buffer to store the verification trace
 		 */
-		log_level = attr->log_level;
-		log_ubuf = (char __user *) (unsigned long) attr->log_buf;
-		log_size = attr->log_size;
-		log_len = 0;
+		log->level = attr->log_level;
+		log->ubuf = (char __user *) (unsigned long) attr->log_buf;
+		log->len_total = attr->log_size;
 
 		ret = -EINVAL;
-		/* log_* values have to be sane */
-		if (log_size < 128 || log_size > UINT_MAX >> 8 ||
-		    log_level == 0 || log_ubuf == NULL)
-			goto err_unlock;
-
-		ret = -ENOMEM;
-		log_buf = vmalloc(log_size);
-		if (!log_buf)
+		/* log attributes have to be sane */
+		if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 ||
+		    !log->level || !log->ubuf)
 			goto err_unlock;
-	} else {
-		log_level = 0;
 	}
 
 	env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT);
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
 		env->strict_alignment = true;
 
+	if (env->prog->aux->offload) {
+		ret = bpf_prog_offload_verifier_prep(env);
+		if (ret)
+			goto err_unlock;
+	}
+
 	ret = replace_map_fd_with_map_ptr(env);
 	if (ret < 0)
 		goto skip_full_check;
@@ -4373,29 +4572,30 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
 
 	ret = do_check(env);
+	if (env->cur_state) {
+		free_verifier_state(env->cur_state, true);
+		env->cur_state = NULL;
+	}
 
 skip_full_check:
-	while (pop_stack(env, NULL) >= 0);
+	while (!pop_stack(env, NULL, NULL));
 	free_states(env);
 
 	if (ret == 0)
+		sanitize_dead_code(env);
+
+	if (ret == 0)
 		/* program is valid, convert *(u32*)(ctx + off) accesses */
 		ret = convert_ctx_accesses(env);
 
 	if (ret == 0)
 		ret = fixup_bpf_calls(env);
 
-	if (log_level && log_len >= log_size - 1) {
-		BUG_ON(log_len >= log_size);
-		/* verifier log exceeded user supplied buffer */
+	if (log->level && bpf_verifier_log_full(log))
 		ret = -ENOSPC;
-		/* fall through to return what was recorded */
-	}
-
-	/* copy verifier log back to user space including trailing zero */
-	if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+	if (log->level && !log->ubuf) {
 		ret = -EFAULT;
-		goto free_log_buf;
+		goto err_release_maps;
 	}
 
 	if (ret == 0 && env->used_map_cnt) {
@@ -4406,7 +4606,7 @@ skip_full_check:
 
 		if (!env->prog->aux->used_maps) {
 			ret = -ENOMEM;
-			goto free_log_buf;
+			goto err_release_maps;
 		}
 
 		memcpy(env->prog->aux->used_maps, env->used_maps,
@@ -4419,9 +4619,7 @@ skip_full_check:
 		convert_pseudo_ld_imm64(env);
 	}
 
-free_log_buf:
-	if (log_level)
-		vfree(log_buf);
+err_release_maps:
 	if (!env->prog->aux->used_maps)
 		/* if we didn't copy map pointers into bpf_prog_info, release
 		 * them now. Otherwise free_bpf_prog_info() will release them.
@@ -4435,58 +4633,3 @@ err_free_env:
 	kfree(env);
 	return ret;
 }
-
-int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
-		 void *priv)
-{
-	struct bpf_verifier_env *env;
-	int ret;
-
-	env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL);
-	if (!env)
-		return -ENOMEM;
-
-	env->insn_aux_data = vzalloc(sizeof(struct bpf_insn_aux_data) *
-				     prog->len);
-	ret = -ENOMEM;
-	if (!env->insn_aux_data)
-		goto err_free_env;
-	env->prog = prog;
-	env->analyzer_ops = ops;
-	env->analyzer_priv = priv;
-
-	/* grab the mutex to protect few globals used by verifier */
-	mutex_lock(&bpf_verifier_lock);
-
-	log_level = 0;
-
-	env->strict_alignment = false;
-	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
-		env->strict_alignment = true;
-
-	env->explored_states = kcalloc(env->prog->len,
-				       sizeof(struct bpf_verifier_state_list *),
-				       GFP_KERNEL);
-	ret = -ENOMEM;
-	if (!env->explored_states)
-		goto skip_full_check;
-
-	ret = check_cfg(env);
-	if (ret < 0)
-		goto skip_full_check;
-
-	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
-	ret = do_check(env);
-
-skip_full_check:
-	while (pop_stack(env, NULL) >= 0);
-	free_states(env);
-
-	mutex_unlock(&bpf_verifier_lock);
-	vfree(env->insn_aux_data);
-err_free_env:
-	kfree(env);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(bpf_analyzer);
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index ae448f7632cc..2be89a003185 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -1,5 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
-obj-y := cgroup.o namespace.o cgroup-v1.o
+obj-y := cgroup.o stat.o namespace.o cgroup-v1.o
 
 obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
 obj-$(CONFIG_CGROUP_PIDS) += pids.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bf54ade001be..b928b27050c6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -201,6 +201,15 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 int cgroup_task_count(const struct cgroup *cgrp);
 
 /*
+ * stat.c
+ */
+void cgroup_stat_flush(struct cgroup *cgrp);
+int cgroup_stat_init(struct cgroup *cgrp);
+void cgroup_stat_exit(struct cgroup *cgrp);
+void cgroup_stat_show_cputime(struct seq_file *seq);
+void cgroup_stat_boot(void);
+
+/*
  * namespace.c
  */
 extern const struct proc_ns_operations cgroupns_operations;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 44857278eb8a..0b1ffe147f24 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -142,12 +142,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
 };
 #undef SUBSYS
 
+static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat);
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
  * part of that cgroup.
  */
-struct cgroup_root cgrp_dfl_root;
+struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat };
 EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
@@ -462,6 +464,28 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
 }
 
 /**
+ * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get @cgrp's css assocaited with @ss.  If the css doesn't exist
+ * or is offline, %NULL is returned.
+ */
+static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
+						     struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+	css = cgroup_css(cgrp, ss);
+	if (!css || !css_tryget_online(css))
+		css = NULL;
+	rcu_read_unlock();
+
+	return css;
+}
+
+/**
  * cgroup_e_css - obtain a cgroup's effective css for the specified subsystem
  * @cgrp: the cgroup of interest
  * @ss: the subsystem of interest (%NULL returns @cgrp->self)
@@ -647,6 +671,14 @@ struct css_set init_css_set = {
 	.cgrp_links		= LIST_HEAD_INIT(init_css_set.cgrp_links),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+
+	/*
+	 * The following field is re-initialized when this cset gets linked
+	 * in cgroup_init().  However, let's initialize the field
+	 * statically too so that the default cgroup can be accessed safely
+	 * early during boot.
+	 */
+	.dfl_cgrp		= &cgrp_dfl_root.cgrp,
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
@@ -1896,6 +1928,9 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags)
 	if (ret)
 		goto destroy_root;
 
+	ret = cgroup_bpf_inherit(root_cgrp);
+	WARN_ON_ONCE(ret);
+
 	trace_cgroup_setup_root(root);
 
 	/*
@@ -3312,6 +3347,37 @@ static int cgroup_stat_show(struct seq_file *seq, void *v)
 	return 0;
 }
 
+static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq,
+						 struct cgroup *cgrp, int ssid)
+{
+	struct cgroup_subsys *ss = cgroup_subsys[ssid];
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	if (!ss->css_extra_stat_show)
+		return 0;
+
+	css = cgroup_tryget_css(cgrp, ss);
+	if (!css)
+		return 0;
+
+	ret = ss->css_extra_stat_show(seq, css);
+	css_put(css);
+	return ret;
+}
+
+static int cpu_stat_show(struct seq_file *seq, void *v)
+{
+	struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup;
+	int ret = 0;
+
+	cgroup_stat_show_cputime(seq);
+#ifdef CONFIG_CGROUP_SCHED
+	ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id);
+#endif
+	return ret;
+}
+
 static int cgroup_file_open(struct kernfs_open_file *of)
 {
 	struct cftype *cft = of->kn->priv;
@@ -4419,6 +4485,11 @@ static struct cftype cgroup_base_files[] = {
 		.name = "cgroup.stat",
 		.seq_show = cgroup_stat_show,
 	},
+	{
+		.name = "cpu.stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_stat_show,
+	},
 	{ }	/* terminate */
 };
 
@@ -4479,6 +4550,8 @@ static void css_free_work_fn(struct work_struct *work)
 			 */
 			cgroup_put(cgroup_parent(cgrp));
 			kernfs_put(cgrp->kn);
+			if (cgroup_on_dfl(cgrp))
+				cgroup_stat_exit(cgrp);
 			kfree(cgrp);
 		} else {
 			/*
@@ -4523,6 +4596,9 @@ static void css_release_work_fn(struct work_struct *work)
 		/* cgroup release path */
 		trace_cgroup_release(cgrp);
 
+		if (cgroup_on_dfl(cgrp))
+			cgroup_stat_flush(cgrp);
+
 		for (tcgrp = cgroup_parent(cgrp); tcgrp;
 		     tcgrp = cgroup_parent(tcgrp))
 			tcgrp->nr_dying_descendants--;
@@ -4706,6 +4782,12 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (ret)
 		goto out_free_cgrp;
 
+	if (cgroup_on_dfl(parent)) {
+		ret = cgroup_stat_init(cgrp);
+		if (ret)
+			goto out_cancel_ref;
+	}
+
 	/*
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
@@ -4713,7 +4795,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
-		goto out_cancel_ref;
+		goto out_stat_exit;
 	}
 
 	init_cgroup_housekeeping(cgrp);
@@ -4721,6 +4803,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	cgrp->self.parent = &parent->self;
 	cgrp->root = root;
 	cgrp->level = level;
+	ret = cgroup_bpf_inherit(cgrp);
+	if (ret)
+		goto out_idr_free;
 
 	for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
 		cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
@@ -4755,13 +4840,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
-	if (parent)
-		cgroup_bpf_inherit(cgrp, parent);
-
 	cgroup_propagate_control(cgrp);
 
 	return cgrp;
 
+out_idr_free:
+	cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
+out_stat_exit:
+	if (cgroup_on_dfl(parent))
+		cgroup_stat_exit(cgrp);
 out_cancel_ref:
 	percpu_ref_exit(&cgrp->self.refcnt);
 out_free_cgrp:
@@ -5156,6 +5243,8 @@ int __init cgroup_init(void)
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
 
+	cgroup_stat_boot();
+
 	/*
 	 * The latency of the synchronize_sched() is too high for cgroups,
 	 * avoid it at the cost of forcing all readers into the slow path.
@@ -5744,15 +5833,103 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
 #endif	/* CONFIG_SOCK_CGROUP_DATA */
 
 #ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_update(struct cgroup *cgrp, struct bpf_prog *prog,
-		      enum bpf_attach_type type, bool overridable)
+int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
 {
-	struct cgroup *parent = cgroup_parent(cgrp);
 	int ret;
 
 	mutex_lock(&cgroup_mutex);
-	ret = __cgroup_bpf_update(cgrp, parent, prog, type, overridable);
+	ret = __cgroup_bpf_attach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+		      enum bpf_attach_type type, u32 flags)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_detach(cgrp, prog, type, flags);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+		     union bpf_attr __user *uattr)
+{
+	int ret;
+
+	mutex_lock(&cgroup_mutex);
+	ret = __cgroup_bpf_query(cgrp, attr, uattr);
 	mutex_unlock(&cgroup_mutex);
 	return ret;
 }
 #endif /* CONFIG_CGROUP_BPF */
+
+#ifdef CONFIG_SYSFS
+static ssize_t show_delegatable_files(struct cftype *files, char *buf,
+				      ssize_t size, const char *prefix)
+{
+	struct cftype *cft;
+	ssize_t ret = 0;
+
+	for (cft = files; cft && cft->name[0] != '\0'; cft++) {
+		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
+			continue;
+
+		if (prefix)
+			ret += snprintf(buf + ret, size - ret, "%s.", prefix);
+
+		ret += snprintf(buf + ret, size - ret, "%s\n", cft->name);
+
+		if (unlikely(ret >= size)) {
+			WARN_ON(1);
+			break;
+		}
+	}
+
+	return ret;
+}
+
+static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *buf)
+{
+	struct cgroup_subsys *ss;
+	int ssid;
+	ssize_t ret = 0;
+
+	ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
+				     NULL);
+
+	for_each_subsys(ss, ssid)
+		ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
+					      PAGE_SIZE - ret,
+					      cgroup_subsys_name[ssid]);
+
+	return ret;
+}
+static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
+
+static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
+			     char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "nsdelegate\n");
+}
+static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
+
+static struct attribute *cgroup_sysfs_attrs[] = {
+	&cgroup_delegate_attr.attr,
+	&cgroup_features_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group cgroup_sysfs_attr_group = {
+	.attrs = cgroup_sysfs_attrs,
+	.name = "cgroup",
+};
+
+static int __init cgroup_sysfs_init(void)
+{
+	return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group);
+}
+subsys_initcall(cgroup_sysfs_init);
+#endif /* CONFIG_SYSFS */
diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c
new file mode 100644
index 000000000000..133b465691d6
--- /dev/null
+++ b/kernel/cgroup/stat.c
@@ -0,0 +1,334 @@
+#include "cgroup-internal.h"
+
+#include <linux/sched/cputime.h>
+
+static DEFINE_MUTEX(cgroup_stat_mutex);
+static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock);
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgrp->cpu_stat, cpu);
+}
+
+/**
+ * cgroup_cpu_stat_updated - keep track of updated cpu_stat
+ * @cgrp: target cgroup
+ * @cpu: cpu on which cpu_stat was updated
+ *
+ * @cgrp's cpu_stat on @cpu was updated.  Put it on the parent's matching
+ * cpu_stat->updated_children list.  See the comment on top of
+ * cgroup_cpu_stat definition for details.
+ */
+static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu)
+{
+	raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+	struct cgroup *parent;
+	unsigned long flags;
+
+	/*
+	 * Speculative already-on-list test.  This may race leading to
+	 * temporary inaccuracies, which is fine.
+	 *
+	 * Because @parent's updated_children is terminated with @parent
+	 * instead of NULL, we can tell whether @cgrp is on the list by
+	 * testing the next pointer for NULL.
+	 */
+	if (cgroup_cpu_stat(cgrp, cpu)->updated_next)
+		return;
+
+	raw_spin_lock_irqsave(cpu_lock, flags);
+
+	/* put @cgrp and all ancestors on the corresponding updated lists */
+	for (parent = cgroup_parent(cgrp); parent;
+	     cgrp = parent, parent = cgroup_parent(cgrp)) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+
+		/*
+		 * Both additions and removals are bottom-up.  If a cgroup
+		 * is already in the tree, all ancestors are.
+		 */
+		if (cstat->updated_next)
+			break;
+
+		cstat->updated_next = pcstat->updated_children;
+		pcstat->updated_children = cgrp;
+	}
+
+	raw_spin_unlock_irqrestore(cpu_lock, flags);
+}
+
+/**
+ * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree
+ * @pos: current position
+ * @root: root of the tree to traversal
+ * @cpu: target cpu
+ *
+ * Walks the udpated cpu_stat tree on @cpu from @root.  %NULL @pos starts
+ * the traversal and %NULL return indicates the end.  During traversal,
+ * each returned cgroup is unlinked from the tree.  Must be called with the
+ * matching cgroup_cpu_stat_lock held.
+ *
+ * The only ordering guarantee is that, for a parent and a child pair
+ * covered by a given traversal, if a child is visited, its parent is
+ * guaranteed to be visited afterwards.
+ */
+static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos,
+						  struct cgroup *root, int cpu)
+{
+	struct cgroup_cpu_stat *cstat;
+	struct cgroup *parent;
+
+	if (pos == root)
+		return NULL;
+
+	/*
+	 * We're gonna walk down to the first leaf and visit/remove it.  We
+	 * can pick whatever unvisited node as the starting point.
+	 */
+	if (!pos)
+		pos = root;
+	else
+		pos = cgroup_parent(pos);
+
+	/* walk down to the first leaf */
+	while (true) {
+		cstat = cgroup_cpu_stat(pos, cpu);
+		if (cstat->updated_children == pos)
+			break;
+		pos = cstat->updated_children;
+	}
+
+	/*
+	 * Unlink @pos from the tree.  As the updated_children list is
+	 * singly linked, we have to walk it to find the removal point.
+	 * However, due to the way we traverse, @pos will be the first
+	 * child in most cases. The only exception is @root.
+	 */
+	parent = cgroup_parent(pos);
+	if (parent && cstat->updated_next) {
+		struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu);
+		struct cgroup_cpu_stat *ncstat;
+		struct cgroup **nextp;
+
+		nextp = &pcstat->updated_children;
+		while (true) {
+			ncstat = cgroup_cpu_stat(*nextp, cpu);
+			if (*nextp == pos)
+				break;
+
+			WARN_ON_ONCE(*nextp == parent);
+			nextp = &ncstat->updated_next;
+		}
+
+		*nextp = cstat->updated_next;
+		cstat->updated_next = NULL;
+	}
+
+	return pos;
+}
+
+static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat,
+				   struct cgroup_stat *src_stat)
+{
+	dst_stat->cputime.utime += src_stat->cputime.utime;
+	dst_stat->cputime.stime += src_stat->cputime.stime;
+	dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime;
+}
+
+static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+	struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+	struct task_cputime *last_cputime = &cstat->last_cputime;
+	struct task_cputime cputime;
+	struct cgroup_stat delta;
+	unsigned seq;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	/* fetch the current per-cpu values */
+	do {
+		seq = __u64_stats_fetch_begin(&cstat->sync);
+		cputime = cstat->cputime;
+	} while (__u64_stats_fetch_retry(&cstat->sync, seq));
+
+	/* accumulate the deltas to propgate */
+	delta.cputime.utime = cputime.utime - last_cputime->utime;
+	delta.cputime.stime = cputime.stime - last_cputime->stime;
+	delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime -
+					 last_cputime->sum_exec_runtime;
+	*last_cputime = cputime;
+
+	/* transfer the pending stat into delta */
+	cgroup_stat_accumulate(&delta, &cgrp->pending_stat);
+	memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat));
+
+	/* propagate delta into the global stat and the parent's pending */
+	cgroup_stat_accumulate(&cgrp->stat, &delta);
+	if (parent)
+		cgroup_stat_accumulate(&parent->pending_stat, &delta);
+}
+
+/* see cgroup_stat_flush() */
+static void cgroup_stat_flush_locked(struct cgroup *cgrp)
+{
+	int cpu;
+
+	lockdep_assert_held(&cgroup_stat_mutex);
+
+	for_each_possible_cpu(cpu) {
+		raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu);
+		struct cgroup *pos = NULL;
+
+		raw_spin_lock_irq(cpu_lock);
+		while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu)))
+			cgroup_cpu_stat_flush_one(pos, cpu);
+		raw_spin_unlock_irq(cpu_lock);
+	}
+}
+
+/**
+ * cgroup_stat_flush - flush stats in @cgrp's subtree
+ * @cgrp: target cgroup
+ *
+ * Collect all per-cpu stats in @cgrp's subtree into the global counters
+ * and propagate them upwards.  After this function returns, all cgroups in
+ * the subtree have up-to-date ->stat.
+ *
+ * This also gets all cgroups in the subtree including @cgrp off the
+ * ->updated_children lists.
+ */
+void cgroup_stat_flush(struct cgroup *cgrp)
+{
+	mutex_lock(&cgroup_stat_mutex);
+	cgroup_stat_flush_locked(cgrp);
+	mutex_unlock(&cgroup_stat_mutex);
+}
+
+static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = get_cpu_ptr(cgrp->cpu_stat);
+	u64_stats_update_begin(&cstat->sync);
+	return cstat;
+}
+
+static void cgroup_cpu_stat_account_end(struct cgroup *cgrp,
+					struct cgroup_cpu_stat *cstat)
+{
+	u64_stats_update_end(&cstat->sync);
+	cgroup_cpu_stat_updated(cgrp, smp_processor_id());
+	put_cpu_ptr(cstat);
+}
+
+void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+	cstat->cputime.sum_exec_runtime += delta_exec;
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void __cgroup_account_cputime_field(struct cgroup *cgrp,
+				    enum cpu_usage_stat index, u64 delta_exec)
+{
+	struct cgroup_cpu_stat *cstat;
+
+	cstat = cgroup_cpu_stat_account_begin(cgrp);
+
+	switch (index) {
+	case CPUTIME_USER:
+	case CPUTIME_NICE:
+		cstat->cputime.utime += delta_exec;
+		break;
+	case CPUTIME_SYSTEM:
+	case CPUTIME_IRQ:
+	case CPUTIME_SOFTIRQ:
+		cstat->cputime.stime += delta_exec;
+		break;
+	default:
+		break;
+	}
+
+	cgroup_cpu_stat_account_end(cgrp, cstat);
+}
+
+void cgroup_stat_show_cputime(struct seq_file *seq)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	u64 usage, utime, stime;
+
+	if (!cgroup_parent(cgrp))
+		return;
+
+	mutex_lock(&cgroup_stat_mutex);
+
+	cgroup_stat_flush_locked(cgrp);
+
+	usage = cgrp->stat.cputime.sum_exec_runtime;
+	cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime,
+		       &utime, &stime);
+
+	mutex_unlock(&cgroup_stat_mutex);
+
+	do_div(usage, NSEC_PER_USEC);
+	do_div(utime, NSEC_PER_USEC);
+	do_div(stime, NSEC_PER_USEC);
+
+	seq_printf(seq, "usage_usec %llu\n"
+		   "user_usec %llu\n"
+		   "system_usec %llu\n",
+		   usage, utime, stime);
+}
+
+int cgroup_stat_init(struct cgroup *cgrp)
+{
+	int cpu;
+
+	/* the root cgrp has cpu_stat preallocated */
+	if (!cgrp->cpu_stat) {
+		cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat);
+		if (!cgrp->cpu_stat)
+			return -ENOMEM;
+	}
+
+	/* ->updated_children list is self terminated */
+	for_each_possible_cpu(cpu)
+		cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp;
+
+	prev_cputime_init(&cgrp->stat.prev_cputime);
+
+	return 0;
+}
+
+void cgroup_stat_exit(struct cgroup *cgrp)
+{
+	int cpu;
+
+	cgroup_stat_flush(cgrp);
+
+	/* sanity check */
+	for_each_possible_cpu(cpu) {
+		struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu);
+
+		if (WARN_ON_ONCE(cstat->updated_children != cgrp) ||
+		    WARN_ON_ONCE(cstat->updated_next))
+			return;
+	}
+
+	free_percpu(cgrp->cpu_stat);
+	cgrp->cpu_stat = NULL;
+}
+
+void __init cgroup_stat_boot(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu));
+
+	BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp));
+}
diff --git a/kernel/compat.c b/kernel/compat.c
index 772e038d04d9..d1cee656a7ed 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -367,24 +367,6 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t,  pid, unsigned int, len,
 	return ret;
 }
 
-int get_compat_itimerspec(struct itimerspec *dst,
-			  const struct compat_itimerspec __user *src)
-{
-	if (__compat_get_timespec(&dst->it_interval, &src->it_interval) ||
-	    __compat_get_timespec(&dst->it_value, &src->it_value))
-		return -EFAULT;
-	return 0;
-}
-
-int put_compat_itimerspec(struct compat_itimerspec __user *dst,
-			  const struct itimerspec *src)
-{
-	if (__compat_put_timespec(&src->it_interval, &dst->it_interval) ||
-	    __compat_put_timespec(&src->it_value, &dst->it_value))
-		return -EFAULT;
-	return 0;
-}
-
 int get_compat_itimerspec64(struct itimerspec64 *its,
 			const struct compat_itimerspec __user *uits)
 {
@@ -485,27 +467,44 @@ Efault:
 	return -EFAULT;
 }
 
-void
-sigset_from_compat(sigset_t *set, const compat_sigset_t *compat)
+int
+get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat)
 {
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
+	if (copy_from_user(&v, compat, sizeof(compat_sigset_t)))
+		return -EFAULT;
 	switch (_NSIG_WORDS) {
-	case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
-	case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
-	case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
-	case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
+	case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 );
+	case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 );
+	case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 );
+	case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 );
 	}
+#else
+	if (copy_from_user(set, compat, sizeof(compat_sigset_t)))
+		return -EFAULT;
+#endif
+	return 0;
 }
-EXPORT_SYMBOL_GPL(sigset_from_compat);
+EXPORT_SYMBOL_GPL(get_compat_sigset);
 
-void
-sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
+int
+put_compat_sigset(compat_sigset_t __user *compat, const sigset_t *set,
+		  unsigned int size)
 {
+	/* size <= sizeof(compat_sigset_t) <= sizeof(sigset_t) */
+#ifdef __BIG_ENDIAN
+	compat_sigset_t v;
 	switch (_NSIG_WORDS) {
-	case 4: compat->sig[7] = (set->sig[3] >> 32); compat->sig[6] = set->sig[3];
-	case 3: compat->sig[5] = (set->sig[2] >> 32); compat->sig[4] = set->sig[2];
-	case 2: compat->sig[3] = (set->sig[1] >> 32); compat->sig[2] = set->sig[1];
-	case 1: compat->sig[1] = (set->sig[0] >> 32); compat->sig[0] = set->sig[0];
+	case 4: v.sig[7] = (set->sig[3] >> 32); v.sig[6] = set->sig[3];
+	case 3: v.sig[5] = (set->sig[2] >> 32); v.sig[4] = set->sig[2];
+	case 2: v.sig[3] = (set->sig[1] >> 32); v.sig[2] = set->sig[1];
+	case 1: v.sig[1] = (set->sig[0] >> 32); v.sig[0] = set->sig[0];
 	}
+	return copy_to_user(compat, &v, size) ? -EFAULT : 0;
+#else
+	return copy_to_user(compat, set, size) ? -EFAULT : 0;
+#endif
 }
 
 #ifdef CONFIG_NUMA
@@ -563,22 +562,6 @@ COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid,
 }
 #endif
 
-COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
-		       compat_pid_t, pid,
-		       struct compat_timespec __user *, interval)
-{
-	struct timespec t;
-	int ret;
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	ret = sys_sched_rr_get_interval(pid, (struct timespec __user *)&t);
-	set_fs(old_fs);
-	if (compat_put_timespec(&t, interval))
-		return -EFAULT;
-	return ret;
-}
-
 /*
  * Allocate user-space memory for the duration of a single system call,
  * in order to marshall parameters inside a compat thunk.
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index 6db80fc0810b..b3663896278e 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -108,7 +108,8 @@ static int __init parse_crashkernel_mem(char *cmdline,
 				return -EINVAL;
 			}
 		}
-	}
+	} else
+		pr_info("crashkernel size resulted in zero bytes\n");
 
 	return 0;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 494eca1bc760..5961ef6dfd64 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3601,7 +3601,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value,
 		goto out;
 	}
 
-
 	/*
 	 * If the event is currently on this CPU, its either a per-task event,
 	 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
@@ -7871,25 +7870,24 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
 			       struct pt_regs *regs, struct hlist_head *head,
 			       struct task_struct *task)
 {
-	struct bpf_prog *prog = call->prog;
-
-	if (prog) {
+	if (bpf_prog_array_valid(call)) {
 		*(struct pt_regs **)raw_data = regs;
-		if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
+		if (!trace_call_bpf(call, raw_data) || hlist_empty(head)) {
 			perf_swevent_put_recursion_context(rctx);
 			return;
 		}
 	}
 	perf_tp_event(call->event.type, count, raw_data, size, regs, head,
-		      rctx, task, NULL);
+		      rctx, task);
 }
 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
 
 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 		   struct pt_regs *regs, struct hlist_head *head, int rctx,
-		   struct task_struct *task, struct perf_event *event)
+		   struct task_struct *task)
 {
 	struct perf_sample_data data;
+	struct perf_event *event;
 
 	struct perf_raw_record raw = {
 		.frag = {
@@ -7903,15 +7901,9 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
 
 	perf_trace_buf_update(record, event_type);
 
-	/* Use the given event instead of the hlist */
-	if (event) {
+	hlist_for_each_entry_rcu(event, head, hlist_entry) {
 		if (perf_tp_event_match(event, &data, regs))
 			perf_swevent_event(event, count, &data, regs);
-	} else {
-		hlist_for_each_entry_rcu(event, head, hlist_entry) {
-			if (perf_tp_event_match(event, &data, regs))
-				perf_swevent_event(event, count, &data, regs);
-		}
 	}
 
 	/*
@@ -8064,13 +8056,11 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 {
 	bool is_kprobe, is_tracepoint, is_syscall_tp;
 	struct bpf_prog *prog;
+	int ret;
 
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return perf_event_set_bpf_handler(event, prog_fd);
 
-	if (event->tp_event->prog)
-		return -EEXIST;
-
 	is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
 	is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
 	is_syscall_tp = is_syscall_trace_event(event->tp_event);
@@ -8098,26 +8088,20 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
 			return -EACCES;
 		}
 	}
-	event->tp_event->prog = prog;
-	event->tp_event->bpf_prog_owner = event;
 
-	return 0;
+	ret = perf_event_attach_bpf_prog(event, prog);
+	if (ret)
+		bpf_prog_put(prog);
+	return ret;
 }
 
 static void perf_event_free_bpf_prog(struct perf_event *event)
 {
-	struct bpf_prog *prog;
-
-	perf_event_free_bpf_handler(event);
-
-	if (!event->tp_event)
+	if (event->attr.type != PERF_TYPE_TRACEPOINT) {
+		perf_event_free_bpf_handler(event);
 		return;
-
-	prog = event->tp_event->prog;
-	if (prog && event->tp_event->bpf_prog_owner == event) {
-		event->tp_event->prog = NULL;
-		bpf_prog_put(prog);
 	}
+	perf_event_detach_bpf_prog(event);
 }
 
 #else
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index f3e37971c842..141aa2ca8728 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -411,6 +411,7 @@ err:
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_begin);
 
 static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb)
 {
@@ -480,6 +481,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
 	rb_free_aux(rb);
 	ring_buffer_put(rb);
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_end);
 
 /*
  * Skip over a given number of bytes in the AUX buffer, due to, for example,
@@ -505,6 +507,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_aux_output_skip);
 
 void *perf_get_aux(struct perf_output_handle *handle)
 {
@@ -514,6 +517,7 @@ void *perf_get_aux(struct perf_output_handle *handle)
 
 	return handle->rb->aux_priv;
 }
+EXPORT_SYMBOL_GPL(perf_get_aux);
 
 #define PERF_AUX_GFP	(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY)
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3..432eadf6b58c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -469,7 +469,7 @@ void __init fork_init(void)
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep = kmem_cache_create("task_struct",
 			arch_task_struct_size, align,
-			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
+			SLAB_PANIC|SLAB_ACCOUNT, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -817,8 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	init_rwsem(&mm->mmap_sem);
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
-	atomic_long_set(&mm->nr_ptes, 0);
-	mm_nr_pmds_init(mm);
+	mm_pgtables_bytes_init(mm);
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;
@@ -872,12 +871,9 @@ static void check_mm(struct mm_struct *mm)
 					  "mm:%p idx:%d val:%ld\n", mm, i, x);
 	}
 
-	if (atomic_long_read(&mm->nr_ptes))
-		pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n",
-				atomic_long_read(&mm->nr_ptes));
-	if (mm_nr_pmds(mm))
-		pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
-				mm_nr_pmds(mm));
+	if (mm_pgtables_bytes(mm))
+		pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n",
+				mm_pgtables_bytes(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
@@ -1875,7 +1871,7 @@ static __latent_entropy struct task_struct *copy_process(
 		retval = -ERESTARTNOINTR;
 		goto bad_fork_cancel_cgroup;
 	}
-	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+	if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
 		retval = -ENOMEM;
 		goto bad_fork_cancel_cgroup;
 	}
@@ -2209,18 +2205,18 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
-			SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
+			SLAB_ACCOUNT, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	/*
 	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
@@ -2231,7 +2227,7 @@ void __init proc_caches_init(void)
 	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
 	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index f2edcf85780d..49b54e9979cc 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -862,6 +862,7 @@ int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition);
 
 void kstat_incr_irq_this_cpu(unsigned int irq)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2ff1c0c82fc9..0f922729bab9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1246,7 +1246,18 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		 * set the trigger type must match. Also all must
 		 * agree on ONESHOT.
 		 */
-		unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data);
+		unsigned int oldtype;
+
+		/*
+		 * If nobody did set the configuration before, inherit
+		 * the one provided by the requester.
+		 */
+		if (irqd_trigger_type_was_set(&desc->irq_data)) {
+			oldtype = irqd_get_trigger_type(&desc->irq_data);
+		} else {
+			oldtype = new->flags & IRQF_TRIGGER_MASK;
+			irqd_set_trigger_type(&desc->irq_data, oldtype);
+		}
 
 		if (!((old->flags & new->flags) & IRQF_SHARED) ||
 		    (oldtype != (new->flags & IRQF_TRIGGER_MASK)) ||
diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c
index a3cbbc8191c5..7df2480005f8 100644
--- a/kernel/irq/matrix.c
+++ b/kernel/irq/matrix.c
@@ -384,7 +384,7 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown)
 {
 	struct cpumap *cm = this_cpu_ptr(m->maps);
 
-	return m->global_available - cpudown ? cm->available : 0;
+	return (m->global_available - cpudown) ? cm->available : 0;
 }
 
 /**
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 1215229d1c12..ef2a47e0eab6 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -20,7 +20,7 @@
 static int irqfixup __read_mostly;
 
 #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
-static void poll_spurious_irqs(unsigned long dummy);
+static void poll_spurious_irqs(struct timer_list *unused);
 static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs);
 static int irq_poll_cpu;
 static atomic_t irq_poll_active;
@@ -143,7 +143,7 @@ out:
 	return ok;
 }
 
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_spurious_irqs(struct timer_list *unused)
 {
 	struct irq_desc *desc;
 	int i;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8ff4ca4665ff..8594d24e4adc 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -769,7 +769,7 @@ static __init int jump_label_test(void)
 
 	return 0;
 }
-late_initcall(jump_label_test);
+early_initcall(jump_label_test);
 #endif /* STATIC_KEYS_SELFTEST */
 
 #endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 1e6ae66c6244..d5fa4116688a 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,6 +24,7 @@
 #include <linux/ctype.h>
 #include <linux/slab.h>
 #include <linux/filter.h>
+#include <linux/ftrace.h>
 #include <linux/compiler.h>
 
 #include <asm/sections.h>
@@ -337,6 +338,10 @@ const char *kallsyms_lookup(unsigned long addr,
 	if (!ret)
 		ret = bpf_address_lookup(addr, symbolsize,
 					 offset, modname, namebuf);
+
+	if (!ret)
+		ret = ftrace_mod_address_lookup(addr, symbolsize,
+						offset, modname, namebuf);
 	return ret;
 }
 
@@ -474,6 +479,7 @@ EXPORT_SYMBOL(__print_symbol);
 struct kallsym_iter {
 	loff_t pos;
 	loff_t pos_mod_end;
+	loff_t pos_ftrace_mod_end;
 	unsigned long value;
 	unsigned int nameoff; /* If iterating in core kernel symbols. */
 	char type;
@@ -497,11 +503,25 @@ static int get_ksymbol_mod(struct kallsym_iter *iter)
 	return 1;
 }
 
+static int get_ksymbol_ftrace_mod(struct kallsym_iter *iter)
+{
+	int ret = ftrace_mod_get_kallsym(iter->pos - iter->pos_mod_end,
+					 &iter->value, &iter->type,
+					 iter->name, iter->module_name,
+					 &iter->exported);
+	if (ret < 0) {
+		iter->pos_ftrace_mod_end = iter->pos;
+		return 0;
+	}
+
+	return 1;
+}
+
 static int get_ksymbol_bpf(struct kallsym_iter *iter)
 {
 	iter->module_name[0] = '\0';
 	iter->exported = 0;
-	return bpf_get_kallsym(iter->pos - iter->pos_mod_end,
+	return bpf_get_kallsym(iter->pos - iter->pos_ftrace_mod_end,
 			       &iter->value, &iter->type,
 			       iter->name) < 0 ? 0 : 1;
 }
@@ -526,20 +546,31 @@ static void reset_iter(struct kallsym_iter *iter, loff_t new_pos)
 	iter->name[0] = '\0';
 	iter->nameoff = get_symbol_offset(new_pos);
 	iter->pos = new_pos;
-	if (new_pos == 0)
+	if (new_pos == 0) {
 		iter->pos_mod_end = 0;
+		iter->pos_ftrace_mod_end = 0;
+	}
 }
 
 static int update_iter_mod(struct kallsym_iter *iter, loff_t pos)
 {
 	iter->pos = pos;
 
-	if (iter->pos_mod_end > 0 &&
-	    iter->pos_mod_end < iter->pos)
+	if (iter->pos_ftrace_mod_end > 0 &&
+	    iter->pos_ftrace_mod_end < iter->pos)
 		return get_ksymbol_bpf(iter);
 
-	if (!get_ksymbol_mod(iter))
-		return get_ksymbol_bpf(iter);
+	if (iter->pos_mod_end > 0 &&
+	    iter->pos_mod_end < iter->pos) {
+		if (!get_ksymbol_ftrace_mod(iter))
+			return get_ksymbol_bpf(iter);
+		return 1;
+	}
+
+	if (!get_ksymbol_mod(iter)) {
+		if (!get_ksymbol_ftrace_mod(iter))
+			return get_ksymbol_bpf(iter);
+	}
 
 	return 1;
 }
@@ -583,14 +614,14 @@ static void s_stop(struct seq_file *m, void *p)
 
 static int s_show(struct seq_file *m, void *p)
 {
-	unsigned long value;
+	void *value;
 	struct kallsym_iter *iter = m->private;
 
 	/* Some debugging symbols have no name.  Ignore them. */
 	if (!iter->name[0])
 		return 0;
 
-	value = iter->show_value ? iter->value : 0;
+	value = iter->show_value ? (void *)iter->value : NULL;
 
 	if (iter->module_name[0]) {
 		char type;
@@ -601,10 +632,10 @@ static int s_show(struct seq_file *m, void *p)
 		 */
 		type = iter->exported ? toupper(iter->type) :
 					tolower(iter->type);
-		seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value,
+		seq_printf(m, "%px %c %s\t[%s]\n", value,
 			   type, iter->name, iter->module_name);
 	} else
-		seq_printf(m, KALLSYM_FMT " %c %s\n", value,
+		seq_printf(m, "%px %c %s\n", value,
 			   iter->type, iter->name);
 	return 0;
 }
diff --git a/kernel/kcov.c b/kernel/kcov.c
index fc6af9e1308b..15f33faf4013 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -22,13 +22,21 @@
 #include <linux/kcov.h>
 #include <asm/setup.h>
 
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
 /*
  * kcov descriptor (one per opened debugfs file).
  * State transitions of the descriptor:
  *  - initial state after open()
  *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
  *  - then, mmap() call (several calls are allowed but not useful)
- *  - then, repeated enable/disable for a task (only one task a time allowed)
+ *  - then, ioctl(KCOV_ENABLE, arg), where arg is
+ *	KCOV_TRACE_PC - to trace only the PCs
+ *	or
+ *	KCOV_TRACE_CMP - to trace only the comparison operands
+ *  - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task a time allowed).
  */
 struct kcov {
 	/*
@@ -48,51 +56,176 @@ struct kcov {
 	struct task_struct	*t;
 };
 
-/*
- * Entry point from instrumented code.
- * This is called once per basic-block/edge.
- */
-void notrace __sanitizer_cov_trace_pc(void)
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
 {
-	struct task_struct *t;
 	enum kcov_mode mode;
 
-	t = current;
 	/*
 	 * We are interested in code coverage as a function of a syscall inputs,
 	 * so we ignore code executed in interrupts.
 	 */
-	if (!t || !in_task())
-		return;
+	if (!in_task())
+		return false;
 	mode = READ_ONCE(t->kcov_mode);
-	if (mode == KCOV_MODE_TRACE) {
-		unsigned long *area;
-		unsigned long pos;
-		unsigned long ip = _RET_IP_;
+	/*
+	 * There is some code that runs in interrupts but for which
+	 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+	 * READ_ONCE()/barrier() effectively provides load-acquire wrt
+	 * interrupts, there are paired barrier()/WRITE_ONCE() in
+	 * kcov_ioctl_locked().
+	 */
+	barrier();
+	return mode == needed_mode;
+}
 
+static unsigned long canonicalize_ip(unsigned long ip)
+{
 #ifdef CONFIG_RANDOMIZE_BASE
-		ip -= kaslr_offset();
+	ip -= kaslr_offset();
 #endif
+	return ip;
+}
 
-		/*
-		 * There is some code that runs in interrupts but for which
-		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
-		 * READ_ONCE()/barrier() effectively provides load-acquire wrt
-		 * interrupts, there are paired barrier()/WRITE_ONCE() in
-		 * kcov_ioctl_locked().
-		 */
-		barrier();
-		area = t->kcov_area;
-		/* The first word is number of subsequent PCs. */
-		pos = READ_ONCE(area[0]) + 1;
-		if (likely(pos < t->kcov_size)) {
-			area[pos] = ip;
-			WRITE_ONCE(area[0], pos);
-		}
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	unsigned long *area;
+	unsigned long ip = canonicalize_ip(_RET_IP_);
+	unsigned long pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+		return;
+
+	area = t->kcov_area;
+	/* The first 64-bit word is the number of subsequent PCs. */
+	pos = READ_ONCE(area[0]) + 1;
+	if (likely(pos < t->kcov_size)) {
+		area[pos] = ip;
+		WRITE_ONCE(area[0], pos);
 	}
 }
 EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
 
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+	struct task_struct *t;
+	u64 *area;
+	u64 count, start_index, end_pos, max_pos;
+
+	t = current;
+	if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+		return;
+
+	ip = canonicalize_ip(ip);
+
+	/*
+	 * We write all comparison arguments and types as u64.
+	 * The buffer was allocated for t->kcov_size unsigned longs.
+	 */
+	area = (u64 *)t->kcov_area;
+	max_pos = t->kcov_size * sizeof(unsigned long);
+
+	count = READ_ONCE(area[0]);
+
+	/* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+	start_index = 1 + count * KCOV_WORDS_PER_CMP;
+	end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+	if (likely(end_pos <= max_pos)) {
+		area[start_index] = type;
+		area[start_index + 1] = arg1;
+		area[start_index + 2] = arg2;
+		area[start_index + 3] = ip;
+		WRITE_ONCE(area[0], count + 1);
+	}
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+	write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+			_RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+	u64 i;
+	u64 count = cases[0];
+	u64 size = cases[1];
+	u64 type = KCOV_CMP_CONST;
+
+	switch (size) {
+	case 8:
+		type |= KCOV_CMP_SIZE(0);
+		break;
+	case 16:
+		type |= KCOV_CMP_SIZE(1);
+		break;
+	case 32:
+		type |= KCOV_CMP_SIZE(2);
+		break;
+	case 64:
+		type |= KCOV_CMP_SIZE(3);
+		break;
+	default:
+		return;
+	}
+	for (i = 0; i < count; i++)
+		write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
 static void kcov_get(struct kcov *kcov)
 {
 	atomic_inc(&kcov->refcount);
@@ -129,6 +262,7 @@ void kcov_task_exit(struct task_struct *t)
 	/* Just to not leave dangling references behind. */
 	kcov_task_init(t);
 	kcov->t = NULL;
+	kcov->mode = KCOV_MODE_INIT;
 	spin_unlock(&kcov->lock);
 	kcov_put(kcov);
 }
@@ -147,7 +281,7 @@ static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
 
 	spin_lock(&kcov->lock);
 	size = kcov->size * sizeof(unsigned long);
-	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
 	    vma->vm_end - vma->vm_start != size) {
 		res = -EINVAL;
 		goto exit;
@@ -176,6 +310,7 @@ static int kcov_open(struct inode *inode, struct file *filep)
 	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
 	if (!kcov)
 		return -ENOMEM;
+	kcov->mode = KCOV_MODE_DISABLED;
 	atomic_set(&kcov->refcount, 1);
 	spin_lock_init(&kcov->lock);
 	filep->private_data = kcov;
@@ -211,7 +346,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
 			return -EINVAL;
 		kcov->size = size;
-		kcov->mode = KCOV_MODE_TRACE;
+		kcov->mode = KCOV_MODE_INIT;
 		return 0;
 	case KCOV_ENABLE:
 		/*
@@ -221,17 +356,25 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 		 * at task exit or voluntary by KCOV_DISABLE. After that it can
 		 * be enabled for another task.
 		 */
-		unused = arg;
-		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
-		    kcov->area == NULL)
+		if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
 			return -EINVAL;
 		if (kcov->t != NULL)
 			return -EBUSY;
+		if (arg == KCOV_TRACE_PC)
+			kcov->mode = KCOV_MODE_TRACE_PC;
+		else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+			kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+		return -ENOTSUPP;
+#endif
+		else
+			return -EINVAL;
 		t = current;
 		/* Cache in task struct for performance. */
 		t->kcov_size = kcov->size;
 		t->kcov_area = kcov->area;
-		/* See comment in __sanitizer_cov_trace_pc(). */
+		/* See comment in check_kcov_mode(). */
 		barrier();
 		WRITE_ONCE(t->kcov_mode, kcov->mode);
 		t->kcov = kcov;
@@ -249,6 +392,7 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
 			return -EINVAL;
 		kcov_task_init(t);
 		kcov->t = NULL;
+		kcov->mode = KCOV_MODE_INIT;
 		kcov_put(kcov);
 		return 0;
 	default:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ba3992c8c375..cd50e99202b0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -20,7 +20,6 @@
 #include <linux/freezer.h>
 #include <linux/ptrace.h>
 #include <linux/uaccess.h>
-#include <linux/cgroup.h>
 #include <trace/events/sched.h>
 
 static DEFINE_SPINLOCK(kthread_create_lock);
@@ -47,6 +46,9 @@ struct kthread {
 	void *data;
 	struct completion parked;
 	struct completion exited;
+#ifdef CONFIG_BLK_CGROUP
+	struct cgroup_subsys_state *blkcg_css;
+#endif
 };
 
 enum KTHREAD_BITS {
@@ -74,11 +76,17 @@ static inline struct kthread *to_kthread(struct task_struct *k)
 
 void free_kthread_struct(struct task_struct *k)
 {
+	struct kthread *kthread;
+
 	/*
 	 * Can be NULL if this kthread was created by kernel_thread()
 	 * or if kmalloc() in kthread() failed.
 	 */
-	kfree(to_kthread(k));
+	kthread = to_kthread(k);
+#ifdef CONFIG_BLK_CGROUP
+	WARN_ON_ONCE(kthread && kthread->blkcg_css);
+#endif
+	kfree(kthread);
 }
 
 /**
@@ -196,7 +204,7 @@ static int kthread(void *_create)
 	struct kthread *self;
 	int ret;
 
-	self = kmalloc(sizeof(*self), GFP_KERNEL);
+	self = kzalloc(sizeof(*self), GFP_KERNEL);
 	set_kthread_struct(self);
 
 	/* If user was SIGKILLed, I release the structure. */
@@ -212,7 +220,6 @@ static int kthread(void *_create)
 		do_exit(-ENOMEM);
 	}
 
-	self->flags = 0;
 	self->data = data;
 	init_completion(&self->exited);
 	init_completion(&self->parked);
@@ -836,7 +843,7 @@ void __kthread_queue_delayed_work(struct kthread_worker *worker,
 	struct timer_list *timer = &dwork->timer;
 	struct kthread_work *work = &dwork->work;
 
-	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)kthread_delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn);
 
 	/*
 	 * If @delay is 0, queue @dwork->work immediately.  This is for
@@ -1152,3 +1159,54 @@ void kthread_destroy_worker(struct kthread_worker *worker)
 	kfree(worker);
 }
 EXPORT_SYMBOL(kthread_destroy_worker);
+
+#ifdef CONFIG_BLK_CGROUP
+/**
+ * kthread_associate_blkcg - associate blkcg to current kthread
+ * @css: the cgroup info
+ *
+ * Current thread must be a kthread. The thread is running jobs on behalf of
+ * other threads. In some cases, we expect the jobs attach cgroup info of
+ * original threads instead of that of current thread. This function stores
+ * original thread's cgroup info in current kthread context for later
+ * retrieval.
+ */
+void kthread_associate_blkcg(struct cgroup_subsys_state *css)
+{
+	struct kthread *kthread;
+
+	if (!(current->flags & PF_KTHREAD))
+		return;
+	kthread = to_kthread(current);
+	if (!kthread)
+		return;
+
+	if (kthread->blkcg_css) {
+		css_put(kthread->blkcg_css);
+		kthread->blkcg_css = NULL;
+	}
+	if (css) {
+		css_get(css);
+		kthread->blkcg_css = css;
+	}
+}
+EXPORT_SYMBOL(kthread_associate_blkcg);
+
+/**
+ * kthread_blkcg - get associated blkcg css of current kthread
+ *
+ * Current thread must be a kthread.
+ */
+struct cgroup_subsys_state *kthread_blkcg(void)
+{
+	struct kthread *kthread;
+
+	if (current->flags & PF_KTHREAD) {
+		kthread = to_kthread(current);
+		if (kthread)
+			return kthread->blkcg_css;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(kthread_blkcg);
+#endif
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
index 2b8bdb1925da..b36ceda6488e 100644
--- a/kernel/livepatch/Makefile
+++ b/kernel/livepatch/Makefile
@@ -1,3 +1,3 @@
 obj-$(CONFIG_LIVEPATCH) += livepatch.o
 
-livepatch-objs := core.o patch.o transition.o
+livepatch-objs := core.o patch.o shadow.o transition.o
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index bf8c8fd72589..de9e45dca70f 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -54,11 +54,6 @@ static bool klp_is_module(struct klp_object *obj)
 	return obj->name;
 }
 
-static bool klp_is_object_loaded(struct klp_object *obj)
-{
-	return !obj->name || obj->mod;
-}
-
 /* sets obj->mod if object is not vmlinux and module is found */
 static void klp_find_object_module(struct klp_object *obj)
 {
@@ -285,6 +280,11 @@ static int klp_write_object_relocations(struct module *pmod,
 
 static int __klp_disable_patch(struct klp_patch *patch)
 {
+	struct klp_object *obj;
+
+	if (WARN_ON(!patch->enabled))
+		return -EINVAL;
+
 	if (klp_transition_patch)
 		return -EBUSY;
 
@@ -295,6 +295,10 @@ static int __klp_disable_patch(struct klp_patch *patch)
 
 	klp_init_transition(patch, KLP_UNPATCHED);
 
+	klp_for_each_object(patch, obj)
+		if (obj->patched)
+			klp_pre_unpatch_callback(obj);
+
 	/*
 	 * Enforce the order of the func->transition writes in
 	 * klp_init_transition() and the TIF_PATCH_PENDING writes in
@@ -388,13 +392,18 @@ static int __klp_enable_patch(struct klp_patch *patch)
 		if (!klp_is_object_loaded(obj))
 			continue;
 
-		ret = klp_patch_object(obj);
+		ret = klp_pre_patch_callback(obj);
 		if (ret) {
-			pr_warn("failed to enable patch '%s'\n",
-				patch->mod->name);
+			pr_warn("pre-patch callback failed for object '%s'\n",
+				klp_is_module(obj) ? obj->name : "vmlinux");
+			goto err;
+		}
 
-			klp_cancel_transition();
-			return ret;
+		ret = klp_patch_object(obj);
+		if (ret) {
+			pr_warn("failed to patch object '%s'\n",
+				klp_is_module(obj) ? obj->name : "vmlinux");
+			goto err;
 		}
 	}
 
@@ -403,6 +412,11 @@ static int __klp_enable_patch(struct klp_patch *patch)
 	patch->enabled = true;
 
 	return 0;
+err:
+	pr_warn("failed to enable patch '%s'\n", patch->mod->name);
+
+	klp_cancel_transition();
+	return ret;
 }
 
 /**
@@ -854,9 +868,15 @@ static void klp_cleanup_module_patches_limited(struct module *mod,
 			 * is in transition.
 			 */
 			if (patch->enabled || patch == klp_transition_patch) {
+
+				if (patch != klp_transition_patch)
+					klp_pre_unpatch_callback(obj);
+
 				pr_notice("reverting patch '%s' on unloading module '%s'\n",
 					  patch->mod->name, obj->mod->name);
 				klp_unpatch_object(obj);
+
+				klp_post_unpatch_callback(obj);
 			}
 
 			klp_free_object_loaded(obj);
@@ -906,13 +926,25 @@ int klp_module_coming(struct module *mod)
 			pr_notice("applying patch '%s' to loading module '%s'\n",
 				  patch->mod->name, obj->mod->name);
 
+			ret = klp_pre_patch_callback(obj);
+			if (ret) {
+				pr_warn("pre-patch callback failed for object '%s'\n",
+					obj->name);
+				goto err;
+			}
+
 			ret = klp_patch_object(obj);
 			if (ret) {
 				pr_warn("failed to apply patch '%s' to module '%s' (%d)\n",
 					patch->mod->name, obj->mod->name, ret);
+
+				klp_post_unpatch_callback(obj);
 				goto err;
 			}
 
+			if (patch != klp_transition_patch)
+				klp_post_patch_callback(obj);
+
 			break;
 		}
 	}
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
index a351601d7f76..48a83d4364cf 100644
--- a/kernel/livepatch/core.h
+++ b/kernel/livepatch/core.h
@@ -2,6 +2,46 @@
 #ifndef _LIVEPATCH_CORE_H
 #define _LIVEPATCH_CORE_H
 
+#include <linux/livepatch.h>
+
 extern struct mutex klp_mutex;
 
+static inline bool klp_is_object_loaded(struct klp_object *obj)
+{
+	return !obj->name || obj->mod;
+}
+
+static inline int klp_pre_patch_callback(struct klp_object *obj)
+{
+	int ret = 0;
+
+	if (obj->callbacks.pre_patch)
+		ret = (*obj->callbacks.pre_patch)(obj);
+
+	obj->callbacks.post_unpatch_enabled = !ret;
+
+	return ret;
+}
+
+static inline void klp_post_patch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_patch)
+		(*obj->callbacks.post_patch)(obj);
+}
+
+static inline void klp_pre_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.pre_unpatch)
+		(*obj->callbacks.pre_unpatch)(obj);
+}
+
+static inline void klp_post_unpatch_callback(struct klp_object *obj)
+{
+	if (obj->callbacks.post_unpatch_enabled &&
+	    obj->callbacks.post_unpatch)
+		(*obj->callbacks.post_unpatch)(obj);
+
+	obj->callbacks.post_unpatch_enabled = false;
+}
+
 #endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
index 52c4e907c14b..82d584225dc6 100644
--- a/kernel/livepatch/patch.c
+++ b/kernel/livepatch/patch.c
@@ -28,6 +28,7 @@
 #include <linux/slab.h>
 #include <linux/bug.h>
 #include <linux/printk.h>
+#include "core.h"
 #include "patch.h"
 #include "transition.h"
 
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c
new file mode 100644
index 000000000000..fdac27588d60
--- /dev/null
+++ b/kernel/livepatch/shadow.c
@@ -0,0 +1,277 @@
+/*
+ * shadow.c - Shadow Variables
+ *
+ * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com>
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * DOC: Shadow variable API concurrency notes:
+ *
+ * The shadow variable API provides a simple relationship between an
+ * <obj, id> pair and a pointer value.  It is the responsibility of the
+ * caller to provide any mutual exclusion required of the shadow data.
+ *
+ * Once a shadow variable is attached to its parent object via the
+ * klp_shadow_*alloc() API calls, it is considered live: any subsequent
+ * call to klp_shadow_get() may then return the shadow variable's data
+ * pointer.  Callers of klp_shadow_*alloc() should prepare shadow data
+ * accordingly.
+ *
+ * The klp_shadow_*alloc() API calls may allocate memory for new shadow
+ * variable structures.  Their implementation does not call kmalloc
+ * inside any spinlocks, but API callers should pass GFP flags according
+ * to their specific needs.
+ *
+ * The klp_shadow_hash is an RCU-enabled hashtable and is safe against
+ * concurrent klp_shadow_free() and klp_shadow_get() operations.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/livepatch.h>
+
+static DEFINE_HASHTABLE(klp_shadow_hash, 12);
+
+/*
+ * klp_shadow_lock provides exclusive access to the klp_shadow_hash and
+ * the shadow variables it references.
+ */
+static DEFINE_SPINLOCK(klp_shadow_lock);
+
+/**
+ * struct klp_shadow - shadow variable structure
+ * @node:	klp_shadow_hash hash table node
+ * @rcu_head:	RCU is used to safely free this structure
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	data area
+ */
+struct klp_shadow {
+	struct hlist_node node;
+	struct rcu_head rcu_head;
+	void *obj;
+	unsigned long id;
+	char data[];
+};
+
+/**
+ * klp_shadow_match() - verify a shadow variable matches given <obj, id>
+ * @shadow:	shadow variable to match
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * Return: true if the shadow variable matches.
+ */
+static inline bool klp_shadow_match(struct klp_shadow *shadow, void *obj,
+				unsigned long id)
+{
+	return shadow->obj == obj && shadow->id == id;
+}
+
+/**
+ * klp_shadow_get() - retrieve a shadow variable data pointer
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get(void *obj, unsigned long id)
+{
+	struct klp_shadow *shadow;
+
+	rcu_read_lock();
+
+	hash_for_each_possible_rcu(klp_shadow_hash, shadow, node,
+				   (unsigned long)obj) {
+
+		if (klp_shadow_match(shadow, obj, id)) {
+			rcu_read_unlock();
+			return shadow->data;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get);
+
+static void *__klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+		       size_t size, gfp_t gfp_flags, bool warn_on_exist)
+{
+	struct klp_shadow *new_shadow;
+	void *shadow_data;
+	unsigned long flags;
+
+	/* Check if the shadow variable already exists */
+	shadow_data = klp_shadow_get(obj, id);
+	if (shadow_data)
+		goto exists;
+
+	/* Allocate a new shadow variable for use inside the lock below */
+	new_shadow = kzalloc(size + sizeof(*new_shadow), gfp_flags);
+	if (!new_shadow)
+		return NULL;
+
+	new_shadow->obj = obj;
+	new_shadow->id = id;
+
+	/* Initialize the shadow variable if data provided */
+	if (data)
+		memcpy(new_shadow->data, data, size);
+
+	/* Look for <obj, id> again under the lock */
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+	shadow_data = klp_shadow_get(obj, id);
+	if (unlikely(shadow_data)) {
+		/*
+		 * Shadow variable was found, throw away speculative
+		 * allocation.
+		 */
+		spin_unlock_irqrestore(&klp_shadow_lock, flags);
+		kfree(new_shadow);
+		goto exists;
+	}
+
+	/* No <obj, id> found, so attach the newly allocated one */
+	hash_add_rcu(klp_shadow_hash, &new_shadow->node,
+		     (unsigned long)new_shadow->obj);
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+
+	return new_shadow->data;
+
+exists:
+	if (warn_on_exist) {
+		WARN(1, "Duplicate shadow variable <%p, %lx>\n", obj, id);
+		return NULL;
+	}
+
+	return shadow_data;
+}
+
+/**
+ * klp_shadow_alloc() - allocate and add a new shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	pointer to data to attach to parent
+ * @size:	size of attached data
+ * @gfp_flags:	GFP mask for allocation
+ *
+ * Allocates @size bytes for new shadow variable data using @gfp_flags
+ * and copies @size bytes from @data into the new shadow variable's own
+ * data space.  If @data is NULL, @size bytes are still allocated, but
+ * no copy is performed.  The new shadow variable is then added to the
+ * global hashtable.
+ *
+ * If an existing <obj, id> shadow variable can be found, this routine
+ * will issue a WARN, exit early and return NULL.
+ *
+ * Return: the shadow variable data element, NULL on duplicate or
+ * failure.
+ */
+void *klp_shadow_alloc(void *obj, unsigned long id, void *data,
+		       size_t size, gfp_t gfp_flags)
+{
+	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, true);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_alloc);
+
+/**
+ * klp_shadow_get_or_alloc() - get existing or allocate a new shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ * @data:	pointer to data to attach to parent
+ * @size:	size of attached data
+ * @gfp_flags:	GFP mask for allocation
+ *
+ * Returns a pointer to existing shadow data if an <obj, id> shadow
+ * variable is already present.  Otherwise, it creates a new shadow
+ * variable like klp_shadow_alloc().
+ *
+ * This function guarantees that only one shadow variable exists with
+ * the given @id for the given @obj.  It also guarantees that the shadow
+ * variable will be initialized by the given @data only when it did not
+ * exist before.
+ *
+ * Return: the shadow variable data element, NULL on failure.
+ */
+void *klp_shadow_get_or_alloc(void *obj, unsigned long id, void *data,
+			       size_t size, gfp_t gfp_flags)
+{
+	return __klp_shadow_get_or_alloc(obj, id, data, size, gfp_flags, false);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_get_or_alloc);
+
+/**
+ * klp_shadow_free() - detach and free a <obj, id> shadow variable
+ * @obj:	pointer to parent object
+ * @id:		data identifier
+ *
+ * This function releases the memory for this <obj, id> shadow variable
+ * instance, callers should stop referencing it accordingly.
+ */
+void klp_shadow_free(void *obj, unsigned long id)
+{
+	struct klp_shadow *shadow;
+	unsigned long flags;
+
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+
+	/* Delete <obj, id> from hash */
+	hash_for_each_possible(klp_shadow_hash, shadow, node,
+			       (unsigned long)obj) {
+
+		if (klp_shadow_match(shadow, obj, id)) {
+			hash_del_rcu(&shadow->node);
+			kfree_rcu(shadow, rcu_head);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free);
+
+/**
+ * klp_shadow_free_all() - detach and free all <*, id> shadow variables
+ * @id:		data identifier
+ *
+ * This function releases the memory for all <*, id> shadow variable
+ * instances, callers should stop referencing them accordingly.
+ */
+void klp_shadow_free_all(unsigned long id)
+{
+	struct klp_shadow *shadow;
+	unsigned long flags;
+	int i;
+
+	spin_lock_irqsave(&klp_shadow_lock, flags);
+
+	/* Delete all <*, id> from hash */
+	hash_for_each(klp_shadow_hash, i, shadow, node) {
+		if (klp_shadow_match(shadow, shadow->obj, id)) {
+			hash_del_rcu(&shadow->node);
+			kfree_rcu(shadow, rcu_head);
+		}
+	}
+
+	spin_unlock_irqrestore(&klp_shadow_lock, flags);
+}
+EXPORT_SYMBOL_GPL(klp_shadow_free_all);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index b004a1fb6032..56add6327736 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -82,6 +82,10 @@ static void klp_complete_transition(void)
 	unsigned int cpu;
 	bool immediate_func = false;
 
+	pr_debug("'%s': completing %s transition\n",
+		 klp_transition_patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	if (klp_target_state == KLP_UNPATCHED) {
 		/*
 		 * All tasks have transitioned to KLP_UNPATCHED so we can now
@@ -109,9 +113,6 @@ static void klp_complete_transition(void)
 		}
 	}
 
-	if (klp_target_state == KLP_UNPATCHED && !immediate_func)
-		module_put(klp_transition_patch->mod);
-
 	/* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */
 	if (klp_target_state == KLP_PATCHED)
 		klp_synchronize_transition();
@@ -130,6 +131,27 @@ static void klp_complete_transition(void)
 	}
 
 done:
+	klp_for_each_object(klp_transition_patch, obj) {
+		if (!klp_is_object_loaded(obj))
+			continue;
+		if (klp_target_state == KLP_PATCHED)
+			klp_post_patch_callback(obj);
+		else if (klp_target_state == KLP_UNPATCHED)
+			klp_post_unpatch_callback(obj);
+	}
+
+	pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
+		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
+	/*
+	 * See complementary comment in __klp_enable_patch() for why we
+	 * keep the module reference for immediate patches.
+	 */
+	if (!klp_transition_patch->immediate && !immediate_func &&
+	    klp_target_state == KLP_UNPATCHED) {
+		module_put(klp_transition_patch->mod);
+	}
+
 	klp_target_state = KLP_UNDEFINED;
 	klp_transition_patch = NULL;
 }
@@ -145,6 +167,9 @@ void klp_cancel_transition(void)
 	if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED))
 		return;
 
+	pr_debug("'%s': canceling patching transition, going to unpatch\n",
+		 klp_transition_patch->mod->name);
+
 	klp_target_state = KLP_UNPATCHED;
 	klp_complete_transition();
 }
@@ -408,9 +433,6 @@ void klp_try_complete_transition(void)
 	}
 
 success:
-	pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name,
-		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
-
 	/* we're done, now cleanup the data structures */
 	klp_complete_transition();
 }
@@ -426,7 +448,8 @@ void klp_start_transition(void)
 
 	WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED);
 
-	pr_notice("'%s': %s...\n", klp_transition_patch->mod->name,
+	pr_notice("'%s': starting %s transition\n",
+		  klp_transition_patch->mod->name,
 		  klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
 
 	/*
@@ -482,6 +505,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
 	 */
 	klp_target_state = state;
 
+	pr_debug("'%s': initializing %s transition\n", patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching" : "unpatching");
+
 	/*
 	 * If the patch can be applied or reverted immediately, skip the
 	 * per-task transitions.
@@ -547,6 +573,11 @@ void klp_reverse_transition(void)
 	unsigned int cpu;
 	struct task_struct *g, *task;
 
+	pr_debug("'%s': reversing transition from %s\n",
+		 klp_transition_patch->mod->name,
+		 klp_target_state == KLP_PATCHED ? "patching to unpatching" :
+						   "unpatching to patching");
+
 	klp_transition_patch->enabled = !klp_transition_patch->enabled;
 
 	klp_target_state = !klp_target_state;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index db933d063bfc..9776da8db180 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -47,7 +47,6 @@
 #include <linux/stringify.h>
 #include <linux/bitops.h>
 #include <linux/gfp.h>
-#include <linux/kmemcheck.h>
 #include <linux/random.h>
 #include <linux/jhash.h>
 
@@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name,
 {
 	int i;
 
-	kmemcheck_mark_initialized(lock, sizeof(*lock));
-
 	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
 		lock->class_cache[i] = NULL;
 
diff --git a/kernel/module.c b/kernel/module.c
index 32c2cdaccd93..dea01ac9cb74 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -847,10 +847,8 @@ static int add_module_usage(struct module *a, struct module *b)
 
 	pr_debug("Allocating new usage for %s.\n", a->name);
 	use = kmalloc(sizeof(*use), GFP_ATOMIC);
-	if (!use) {
-		pr_warn("%s: out of memory loading\n", a->name);
+	if (!use)
 		return -ENOMEM;
-	}
 
 	use->source = a;
 	use->target = b;
@@ -3483,6 +3481,8 @@ static noinline int do_init_module(struct module *mod)
 	if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
 		async_synchronize_full();
 
+	ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
+			mod->init_layout.size);
 	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
@@ -4157,7 +4157,7 @@ static int m_show(struct seq_file *m, void *p)
 {
 	struct module *mod = list_entry(p, struct module, list);
 	char buf[MODULE_FLAGS_BUF_SIZE];
-	unsigned long value;
+	void *value;
 
 	/* We always ignore unformed modules. */
 	if (mod->state == MODULE_STATE_UNFORMED)
@@ -4173,8 +4173,8 @@ static int m_show(struct seq_file *m, void *p)
 		   mod->state == MODULE_STATE_COMING ? "Loading" :
 		   "Live");
 	/* Used by oprofile and other similar tools. */
-	value = m->private ? 0 : (unsigned long)mod->core_layout.base;
-	seq_printf(m, " 0x" KALLSYM_FMT, value);
+	value = m->private ? NULL : mod->core_layout.base;
+	seq_printf(m, " 0x%px", value);
 
 	/* Taints info */
 	if (mod->taints)
diff --git a/kernel/padata.c b/kernel/padata.c
index 868f947166d7..57c0074d50cc 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -131,6 +131,7 @@ int padata_do_parallel(struct padata_instance *pinst,
 	padata->cb_cpu = cb_cpu;
 
 	target_cpu = padata_cpu_hash(pd);
+	padata->cpu = target_cpu;
 	queue = per_cpu_ptr(pd->pqueue, target_cpu);
 
 	spin_lock(&queue->parallel.lock);
@@ -275,11 +276,51 @@ static void padata_reorder(struct parallel_data *pd)
 	return;
 }
 
-static void padata_reorder_timer(unsigned long arg)
+static void invoke_padata_reorder(struct work_struct *work)
 {
-	struct parallel_data *pd = (struct parallel_data *)arg;
+	struct padata_parallel_queue *pqueue;
+	struct parallel_data *pd;
 
+	local_bh_disable();
+	pqueue = container_of(work, struct padata_parallel_queue, reorder_work);
+	pd = pqueue->pd;
 	padata_reorder(pd);
+	local_bh_enable();
+}
+
+static void padata_reorder_timer(struct timer_list *t)
+{
+	struct parallel_data *pd = from_timer(pd, t, timer);
+	unsigned int weight;
+	int target_cpu, cpu;
+
+	cpu = get_cpu();
+
+	/* We don't lock pd here to not interfere with parallel processing
+	 * padata_reorder() calls on other CPUs. We just need any CPU out of
+	 * the cpumask.pcpu set. It would be nice if it's the right one but
+	 * it doesn't matter if we're off to the next one by using an outdated
+	 * pd->processed value.
+	 */
+	weight = cpumask_weight(pd->cpumask.pcpu);
+	target_cpu = padata_index_to_cpu(pd, pd->processed % weight);
+
+	/* ensure to call the reorder callback on the correct CPU */
+	if (cpu != target_cpu) {
+		struct padata_parallel_queue *pqueue;
+		struct padata_instance *pinst;
+
+		/* The timer function is serialized wrt itself -- no locking
+		 * needed.
+		 */
+		pinst = pd->pinst;
+		pqueue = per_cpu_ptr(pd->pqueue, target_cpu);
+		queue_work_on(target_cpu, pinst->wq, &pqueue->reorder_work);
+	} else {
+		padata_reorder(pd);
+	}
+
+	put_cpu();
 }
 
 static void padata_serial_worker(struct work_struct *serial_work)
@@ -323,10 +364,21 @@ void padata_do_serial(struct padata_priv *padata)
 	int cpu;
 	struct padata_parallel_queue *pqueue;
 	struct parallel_data *pd;
+	int reorder_via_wq = 0;
 
 	pd = padata->pd;
 
 	cpu = get_cpu();
+
+	/* We need to run on the same CPU padata_do_parallel(.., padata, ..)
+	 * was called on -- or, at least, enqueue the padata object into the
+	 * correct per-cpu queue.
+	 */
+	if (cpu != padata->cpu) {
+		reorder_via_wq = 1;
+		cpu = padata->cpu;
+	}
+
 	pqueue = per_cpu_ptr(pd->pqueue, cpu);
 
 	spin_lock(&pqueue->reorder.lock);
@@ -336,7 +388,13 @@ void padata_do_serial(struct padata_priv *padata)
 
 	put_cpu();
 
-	padata_reorder(pd);
+	/* If we're running on the wrong CPU, call padata_reorder() via a
+	 * kernel worker.
+	 */
+	if (reorder_via_wq)
+		queue_work_on(cpu, pd->pinst->wq, &pqueue->reorder_work);
+	else
+		padata_reorder(pd);
 }
 EXPORT_SYMBOL(padata_do_serial);
 
@@ -384,8 +442,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
 	struct padata_parallel_queue *pqueue;
 
 	cpu_index = 0;
-	for_each_cpu(cpu, pd->cpumask.pcpu) {
+	for_each_possible_cpu(cpu) {
 		pqueue = per_cpu_ptr(pd->pqueue, cpu);
+
+		if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+			pqueue->cpu_index = -1;
+			continue;
+		}
+
 		pqueue->pd = pd;
 		pqueue->cpu_index = cpu_index;
 		cpu_index++;
@@ -393,6 +457,7 @@ static void padata_init_pqueues(struct parallel_data *pd)
 		__padata_list_init(&pqueue->reorder);
 		__padata_list_init(&pqueue->parallel);
 		INIT_WORK(&pqueue->work, padata_parallel_worker);
+		INIT_WORK(&pqueue->reorder_work, invoke_padata_reorder);
 		atomic_set(&pqueue->num_obj, 0);
 	}
 }
@@ -420,7 +485,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 
 	padata_init_pqueues(pd);
 	padata_init_squeues(pd);
-	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
+	timer_setup(&pd->timer, padata_reorder_timer, 0);
 	atomic_set(&pd->seq_nr, -1);
 	atomic_set(&pd->reorder_objects, 0);
 	atomic_set(&pd->refcnt, 0);
diff --git a/kernel/panic.c b/kernel/panic.c
index bdd18afa19a4..2cfef408fec9 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,6 +27,8 @@
 #include <linux/console.h>
 #include <linux/bug.h>
 #include <linux/ratelimit.h>
+#include <linux/debugfs.h>
+#include <asm/sections.h>
 
 #define PANIC_TIMER_STEP 100
 #define PANIC_BLINK_SPD 18
@@ -322,6 +324,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
 	{ 'E', ' ', true },	/* TAINT_UNSIGNED_MODULE */
 	{ 'L', ' ', false },	/* TAINT_SOFTLOCKUP */
 	{ 'K', ' ', true },	/* TAINT_LIVEPATCH */
+	{ 'X', ' ', true },	/* TAINT_AUX */
 };
 
 /**
@@ -343,6 +346,7 @@ const struct taint_flag taint_flags[TAINT_FLAGS_COUNT] = {
  *  'E' - Unsigned module has been loaded.
  *  'L' - A soft lockup has previously occurred.
  *  'K' - Kernel has been live patched.
+ *  'X' - Auxiliary taint, for distros' use.
  *
  *	The string is overwritten by the next call to print_tainted().
  */
@@ -518,7 +522,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();
 
-	pr_warn("------------[ cut here ]------------\n");
+	if (args)
+		pr_warn(CUT_HERE);
 
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
@@ -582,9 +587,49 @@ EXPORT_SYMBOL(warn_slowpath_fmt_taint);
 
 void warn_slowpath_null(const char *file, int line)
 {
+	pr_warn(CUT_HERE);
 	__warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL);
 }
 EXPORT_SYMBOL(warn_slowpath_null);
+#else
+void __warn_printk(const char *fmt, ...)
+{
+	va_list args;
+
+	pr_warn(CUT_HERE);
+
+	va_start(args, fmt);
+	vprintk(fmt, args);
+	va_end(args);
+}
+EXPORT_SYMBOL(__warn_printk);
+#endif
+
+#ifdef CONFIG_BUG
+
+/* Support resetting WARN*_ONCE state */
+
+static int clear_warn_once_set(void *data, u64 val)
+{
+	generic_bug_clear_once();
+	memset(__start_once, 0, __end_once - __start_once);
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(clear_warn_once_fops,
+			NULL,
+			clear_warn_once_set,
+			"%lld\n");
+
+static __init int register_warn_debugfs(void)
+{
+	/* Don't care about failure */
+	debugfs_create_file("clear_warn_once", 0200, NULL,
+			    NULL, &clear_warn_once_fops);
+	return 0;
+}
+
+device_initcall(register_warn_debugfs);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
diff --git a/kernel/pid.c b/kernel/pid.c
index 020dedbdf066..b13b624e2c49 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -39,11 +39,8 @@
 #include <linux/proc_ns.h>
 #include <linux/proc_fs.h>
 #include <linux/sched/task.h>
+#include <linux/idr.h>
 
-#define pid_hashfn(nr, ns)	\
-	hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift)
-static struct hlist_head *pid_hash;
-static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
 int pid_max = PID_MAX_DEFAULT;
@@ -53,15 +50,6 @@ int pid_max = PID_MAX_DEFAULT;
 int pid_max_min = RESERVED_PIDS + 1;
 int pid_max_max = PID_MAX_LIMIT;
 
-static inline int mk_pid(struct pid_namespace *pid_ns,
-		struct pidmap *map, int off)
-{
-	return (map - pid_ns->pidmap)*BITS_PER_PAGE + off;
-}
-
-#define find_next_offset(map, off)					\
-		find_next_zero_bit((map)->page, BITS_PER_PAGE, off)
-
 /*
  * PID-map pages start out as NULL, they get allocated upon
  * first use and are never deallocated. This way a low pid_max
@@ -70,11 +58,8 @@ static inline int mk_pid(struct pid_namespace *pid_ns,
  */
 struct pid_namespace init_pid_ns = {
 	.kref = KREF_INIT(2),
-	.pidmap = {
-		[ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL }
-	},
-	.last_pid = 0,
-	.nr_hashed = PIDNS_HASH_ADDING,
+	.idr = IDR_INIT,
+	.pid_allocated = PIDNS_ADDING,
 	.level = 0,
 	.child_reaper = &init_task,
 	.user_ns = &init_user_ns,
@@ -101,138 +86,6 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
 
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
 
-static void free_pidmap(struct upid *upid)
-{
-	int nr = upid->nr;
-	struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
-	int offset = nr & BITS_PER_PAGE_MASK;
-
-	clear_bit(offset, map->page);
-	atomic_inc(&map->nr_free);
-}
-
-/*
- * If we started walking pids at 'base', is 'a' seen before 'b'?
- */
-static int pid_before(int base, int a, int b)
-{
-	/*
-	 * This is the same as saying
-	 *
-	 * (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
-	 * and that mapping orders 'a' and 'b' with respect to 'base'.
-	 */
-	return (unsigned)(a - base) < (unsigned)(b - base);
-}
-
-/*
- * We might be racing with someone else trying to set pid_ns->last_pid
- * at the pid allocation time (there's also a sysctl for this, but racing
- * with this one is OK, see comment in kernel/pid_namespace.c about it).
- * We want the winner to have the "later" value, because if the
- * "earlier" value prevails, then a pid may get reused immediately.
- *
- * Since pids rollover, it is not sufficient to just pick the bigger
- * value.  We have to consider where we started counting from.
- *
- * 'base' is the value of pid_ns->last_pid that we observed when
- * we started looking for a pid.
- *
- * 'pid' is the pid that we eventually found.
- */
-static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid)
-{
-	int prev;
-	int last_write = base;
-	do {
-		prev = last_write;
-		last_write = cmpxchg(&pid_ns->last_pid, prev, pid);
-	} while ((prev != last_write) && (pid_before(base, last_write, pid)));
-}
-
-static int alloc_pidmap(struct pid_namespace *pid_ns)
-{
-	int i, offset, max_scan, pid, last = pid_ns->last_pid;
-	struct pidmap *map;
-
-	pid = last + 1;
-	if (pid >= pid_max)
-		pid = RESERVED_PIDS;
-	offset = pid & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
-	/*
-	 * If last_pid points into the middle of the map->page we
-	 * want to scan this bitmap block twice, the second time
-	 * we start with offset == 0 (or RESERVED_PIDS).
-	 */
-	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
-	for (i = 0; i <= max_scan; ++i) {
-		if (unlikely(!map->page)) {
-			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-			/*
-			 * Free the page if someone raced with us
-			 * installing it:
-			 */
-			spin_lock_irq(&pidmap_lock);
-			if (!map->page) {
-				map->page = page;
-				page = NULL;
-			}
-			spin_unlock_irq(&pidmap_lock);
-			kfree(page);
-			if (unlikely(!map->page))
-				return -ENOMEM;
-		}
-		if (likely(atomic_read(&map->nr_free))) {
-			for ( ; ; ) {
-				if (!test_and_set_bit(offset, map->page)) {
-					atomic_dec(&map->nr_free);
-					set_last_pid(pid_ns, last, pid);
-					return pid;
-				}
-				offset = find_next_offset(map, offset);
-				if (offset >= BITS_PER_PAGE)
-					break;
-				pid = mk_pid(pid_ns, map, offset);
-				if (pid >= pid_max)
-					break;
-			}
-		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
-			++map;
-			offset = 0;
-		} else {
-			map = &pid_ns->pidmap[0];
-			offset = RESERVED_PIDS;
-			if (unlikely(last == offset))
-				break;
-		}
-		pid = mk_pid(pid_ns, map, offset);
-	}
-	return -EAGAIN;
-}
-
-int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
-{
-	int offset;
-	struct pidmap *map, *end;
-
-	if (last >= PID_MAX_LIMIT)
-		return -1;
-
-	offset = (last + 1) & BITS_PER_PAGE_MASK;
-	map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
-	end = &pid_ns->pidmap[PIDMAP_ENTRIES];
-	for (; map < end; map++, offset = 0) {
-		if (unlikely(!map->page))
-			continue;
-		offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
-		if (offset < BITS_PER_PAGE)
-			return mk_pid(pid_ns, map, offset);
-	}
-	return -1;
-}
-
 void put_pid(struct pid *pid)
 {
 	struct pid_namespace *ns;
@@ -265,8 +118,7 @@ void free_pid(struct pid *pid)
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
-		hlist_del_rcu(&upid->pid_chain);
-		switch(--ns->nr_hashed) {
+		switch (--ns->pid_allocated) {
 		case 2:
 		case 1:
 			/* When all that is left in the pid namespace
@@ -275,21 +127,20 @@ void free_pid(struct pid *pid)
 			 */
 			wake_up_process(ns->child_reaper);
 			break;
-		case PIDNS_HASH_ADDING:
+		case PIDNS_ADDING:
 			/* Handle a fork failure of the first process */
 			WARN_ON(ns->child_reaper);
-			ns->nr_hashed = 0;
+			ns->pid_allocated = 0;
 			/* fall through */
 		case 0:
 			schedule_work(&ns->proc_work);
 			break;
 		}
+
+		idr_remove(&ns->idr, upid->nr);
 	}
 	spin_unlock_irqrestore(&pidmap_lock, flags);
 
-	for (i = 0; i <= pid->level; i++)
-		free_pidmap(pid->numbers + i);
-
 	call_rcu(&pid->rcu, delayed_put_pid);
 }
 
@@ -308,8 +159,29 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	tmp = ns;
 	pid->level = ns->level;
+
 	for (i = ns->level; i >= 0; i--) {
-		nr = alloc_pidmap(tmp);
+		int pid_min = 1;
+
+		idr_preload(GFP_KERNEL);
+		spin_lock_irq(&pidmap_lock);
+
+		/*
+		 * init really needs pid 1, but after reaching the maximum
+		 * wrap back to RESERVED_PIDS
+		 */
+		if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
+			pid_min = RESERVED_PIDS;
+
+		/*
+		 * Store a null pointer so find_pid_ns does not find
+		 * a partially initialized PID (see below).
+		 */
+		nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
+				      pid_max, GFP_ATOMIC);
+		spin_unlock_irq(&pidmap_lock);
+		idr_preload_end();
+
 		if (nr < 0) {
 			retval = nr;
 			goto out_free;
@@ -334,12 +206,12 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 
 	upid = pid->numbers + ns->level;
 	spin_lock_irq(&pidmap_lock);
-	if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
+	if (!(ns->pid_allocated & PIDNS_ADDING))
 		goto out_unlock;
 	for ( ; upid >= pid->numbers; --upid) {
-		hlist_add_head_rcu(&upid->pid_chain,
-				&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
-		upid->ns->nr_hashed++;
+		/* Make the PID visible to find_pid_ns. */
+		idr_replace(&upid->ns->idr, pid, upid->nr);
+		upid->ns->pid_allocated++;
 	}
 	spin_unlock_irq(&pidmap_lock);
 
@@ -350,8 +222,11 @@ out_unlock:
 	put_pid_ns(ns);
 
 out_free:
+	spin_lock_irq(&pidmap_lock);
 	while (++i <= ns->level)
-		free_pidmap(pid->numbers + i);
+		idr_remove(&ns->idr, (pid->numbers + i)->nr);
+
+	spin_unlock_irq(&pidmap_lock);
 
 	kmem_cache_free(ns->pid_cachep, pid);
 	return ERR_PTR(retval);
@@ -360,21 +235,13 @@ out_free:
 void disable_pid_allocation(struct pid_namespace *ns)
 {
 	spin_lock_irq(&pidmap_lock);
-	ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+	ns->pid_allocated &= ~PIDNS_ADDING;
 	spin_unlock_irq(&pidmap_lock);
 }
 
 struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
 {
-	struct upid *pnr;
-
-	hlist_for_each_entry_rcu(pnr,
-			&pid_hash[pid_hashfn(nr, ns)], pid_chain)
-		if (pnr->nr == nr && pnr->ns == ns)
-			return container_of(pnr, struct pid,
-					numbers[ns->level]);
-
-	return NULL;
+	return idr_find(&ns->idr, nr);
 }
 EXPORT_SYMBOL_GPL(find_pid_ns);
 
@@ -530,6 +397,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 		if (type != PIDTYPE_PID) {
 			if (type == __PIDTYPE_TGID)
 				type = PIDTYPE_PID;
+
 			task = task->group_leader;
 		}
 		nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
@@ -553,35 +421,13 @@ EXPORT_SYMBOL_GPL(task_active_pid_ns);
  */
 struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
 {
-	struct pid *pid;
-
-	do {
-		pid = find_pid_ns(nr, ns);
-		if (pid)
-			break;
-		nr = next_pidmap(ns, nr);
-	} while (nr > 0);
-
-	return pid;
-}
-
-/*
- * The pid hash table is scaled according to the amount of memory in the
- * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
- * more.
- */
-void __init pidhash_init(void)
-{
-	pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
-					   HASH_EARLY | HASH_SMALL | HASH_ZERO,
-					   &pidhash_shift, NULL,
-					   0, 4096);
+	return idr_get_next(&ns->idr, &nr);
 }
 
-void __init pidmap_init(void)
+void __init pid_idr_init(void)
 {
 	/* Verify no one has done anything silly: */
-	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
 	pid_max = min(pid_max_max, max_t(int, pid_max,
@@ -590,10 +436,7 @@ void __init pidmap_init(void)
 				PIDS_PER_CPU_MIN * num_possible_cpus());
 	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
 
-	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	/* Reserve PID 0. We never call free_pidmap(0) */
-	set_bit(0, init_pid_ns.pidmap[0].page);
-	atomic_dec(&init_pid_ns.pidmap[0].nr_free);
+	idr_init(&init_pid_ns.idr);
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
 			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 4918314893bc..0b53eef7d34b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -21,6 +21,7 @@
 #include <linux/export.h>
 #include <linux/sched/task.h>
 #include <linux/sched/signal.h>
+#include <linux/idr.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -98,7 +99,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	struct pid_namespace *ns;
 	unsigned int level = parent_pid_ns->level + 1;
 	struct ucounts *ucounts;
-	int i;
 	int err;
 
 	err = -EINVAL;
@@ -117,17 +117,15 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	if (ns == NULL)
 		goto out_dec;
 
-	ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!ns->pidmap[0].page)
-		goto out_free;
+	idr_init(&ns->idr);
 
 	ns->pid_cachep = create_pid_cachep(level + 1);
 	if (ns->pid_cachep == NULL)
-		goto out_free_map;
+		goto out_free_idr;
 
 	err = ns_alloc_inum(&ns->ns);
 	if (err)
-		goto out_free_map;
+		goto out_free_idr;
 	ns->ns.ops = &pidns_operations;
 
 	kref_init(&ns->kref);
@@ -135,20 +133,13 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->parent = get_pid_ns(parent_pid_ns);
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
-	ns->nr_hashed = PIDNS_HASH_ADDING;
+	ns->pid_allocated = PIDNS_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
 
-	set_bit(0, ns->pidmap[0].page);
-	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
-
-	for (i = 1; i < PIDMAP_ENTRIES; i++)
-		atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
-
 	return ns;
 
-out_free_map:
-	kfree(ns->pidmap[0].page);
-out_free:
+out_free_idr:
+	idr_destroy(&ns->idr);
 	kmem_cache_free(pid_ns_cachep, ns);
 out_dec:
 	dec_pid_namespaces(ucounts);
@@ -168,11 +159,9 @@ static void delayed_free_pidns(struct rcu_head *p)
 
 static void destroy_pid_namespace(struct pid_namespace *ns)
 {
-	int i;
-
 	ns_free_inum(&ns->ns);
-	for (i = 0; i < PIDMAP_ENTRIES; i++)
-		kfree(ns->pidmap[i].page);
+
+	idr_destroy(&ns->idr);
 	call_rcu(&ns->rcu, delayed_free_pidns);
 }
 
@@ -213,6 +202,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	int rc;
 	struct task_struct *task, *me = current;
 	int init_pids = thread_group_leader(me) ? 1 : 2;
+	struct pid *pid;
 
 	/* Don't allow any more processes into the pid namespace */
 	disable_pid_allocation(pid_ns);
@@ -239,20 +229,16 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * 	  maintain a tasklist for each pid namespace.
 	 *
 	 */
+	rcu_read_lock();
 	read_lock(&tasklist_lock);
-	nr = next_pidmap(pid_ns, 1);
-	while (nr > 0) {
-		rcu_read_lock();
-
-		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+	nr = 2;
+	idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
+		task = pid_task(pid, PIDTYPE_PID);
 		if (task && !__fatal_signal_pending(task))
 			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
-
-		rcu_read_unlock();
-
-		nr = next_pidmap(pid_ns, nr);
 	}
 	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 
 	/*
 	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
@@ -268,7 +254,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 * sys_wait4() above can't reap the EXIT_DEAD children but we do not
 	 * really care, we could reparent them to the global init. We could
 	 * exit and reap ->child_reaper even if it is not the last thread in
-	 * this pid_ns, free_pid(nr_hashed == 0) calls proc_cleanup_work(),
+	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
 	 * pid_ns can not go away until proc_kill_sb() drops the reference.
 	 *
 	 * But this ns can also have other tasks injected by setns()+fork().
@@ -282,7 +268,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	 */
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (pid_ns->nr_hashed == init_pids)
+		if (pid_ns->pid_allocated == init_pids)
 			break;
 		schedule();
 	}
@@ -301,6 +287,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 {
 	struct pid_namespace *pid_ns = task_active_pid_ns(current);
 	struct ctl_table tmp = *table;
+	int ret, next;
 
 	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
 		return -EPERM;
@@ -311,8 +298,14 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 * it should synchronize its usage with external means.
 	 */
 
-	tmp.data = &pid_ns->last_pid;
-	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	next = idr_get_cursor(&pid_ns->idr) - 1;
+
+	tmp.data = &next;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (!ret && write)
+		idr_set_cursor(&pid_ns->idr, next + 1);
+
+	return ret;
 }
 
 extern int pid_max;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e8517b63eb37..e880ca22c5a5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -259,20 +259,6 @@ config APM_EMULATION
 	  anything, try disabling/enabling this option (or disabling/enabling
 	  APM in your BIOS).
 
-config PM_OPP
-	bool
-	select SRCU
-	---help---
-	  SOCs have a standard set of tuples consisting of frequency and
-	  voltage pairs that the device will support per voltage domain. This
-	  is called Operating Performance Point or OPP. The actual definitions
-	  of OPP varies over silicon within the same family of devices.
-
-	  OPP layer organizes the data internally using device pointers
-	  representing individual voltage domains and provides SOC
-	  implementations a ready to use framework to manage OPPs.
-	  For more information, read <file:Documentation/power/opp.txt>
-
 config PM_CLK
 	def_bool y
 	depends on PM && HAVE_CLK
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 97b0df71303e..9d7503910ce2 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -701,8 +701,8 @@ static int __init pm_qos_power_init(void)
 	for (i = PM_QOS_CPU_DMA_LATENCY; i < PM_QOS_NUM_CLASSES; i++) {
 		ret = register_pm_qos_misc(pm_qos_array[i], d);
 		if (ret < 0) {
-			printk(KERN_ERR "pm_qos_param: %s setup failed\n",
-			       pm_qos_array[i]->name);
+			pr_err("%s: %s setup failed\n",
+			       __func__, pm_qos_array[i]->name);
 			return ret;
 		}
 	}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0972a8e09d08..bce0464524d8 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,8 @@
  *
  */
 
+#define pr_fmt(fmt) "PM: " fmt
+
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -967,7 +969,7 @@ void __init __register_nosave_region(unsigned long start_pfn,
 	region->end_pfn = end_pfn;
 	list_add_tail(&region->list, &nosave_regions);
  Report:
-	printk(KERN_INFO "PM: Registered nosave memory: [mem %#010llx-%#010llx]\n",
+	pr_info("Registered nosave memory: [mem %#010llx-%#010llx]\n",
 		(unsigned long long) start_pfn << PAGE_SHIFT,
 		((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
 }
@@ -1039,7 +1041,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 	list_for_each_entry(region, &nosave_regions, list) {
 		unsigned long pfn;
 
-		pr_debug("PM: Marking nosave pages: [mem %#010llx-%#010llx]\n",
+		pr_debug("Marking nosave pages: [mem %#010llx-%#010llx]\n",
 			 (unsigned long long) region->start_pfn << PAGE_SHIFT,
 			 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
 				- 1);
@@ -1095,7 +1097,7 @@ int create_basic_memory_bitmaps(void)
 	free_pages_map = bm2;
 	mark_nosave_pages(forbidden_pages_map);
 
-	pr_debug("PM: Basic memory bitmaps created\n");
+	pr_debug("Basic memory bitmaps created\n");
 
 	return 0;
 
@@ -1131,7 +1133,7 @@ void free_basic_memory_bitmaps(void)
 	memory_bm_free(bm2, PG_UNSAFE_CLEAR);
 	kfree(bm2);
 
-	pr_debug("PM: Basic memory bitmaps freed\n");
+	pr_debug("Basic memory bitmaps freed\n");
 }
 
 void clear_free_pages(void)
@@ -1152,7 +1154,7 @@ void clear_free_pages(void)
 		pfn = memory_bm_next_pfn(bm);
 	}
 	memory_bm_position_reset(bm);
-	pr_info("PM: free pages cleared after restore\n");
+	pr_info("free pages cleared after restore\n");
 #endif /* PAGE_POISONING_ZERO */
 }
 
@@ -1690,7 +1692,7 @@ int hibernate_preallocate_memory(void)
 	ktime_t start, stop;
 	int error;
 
-	printk(KERN_INFO "PM: Preallocating image memory... ");
+	pr_info("Preallocating image memory... ");
 	start = ktime_get();
 
 	error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY);
@@ -1821,13 +1823,13 @@ int hibernate_preallocate_memory(void)
 
  out:
 	stop = ktime_get();
-	printk(KERN_CONT "done (allocated %lu pages)\n", pages);
+	pr_cont("done (allocated %lu pages)\n", pages);
 	swsusp_show_speed(start, stop, pages, "Allocated");
 
 	return 0;
 
  err_out:
-	printk(KERN_CONT "\n");
+	pr_cont("\n");
 	swsusp_free();
 	return -ENOMEM;
 }
@@ -1867,8 +1869,8 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
 			free += zone_page_state(zone, NR_FREE_PAGES);
 
 	nr_pages += count_pages_for_highmem(nr_highmem);
-	pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n",
-		nr_pages, PAGES_FOR_IO, free);
+	pr_debug("Normal pages needed: %u + %u, available pages: %u\n",
+		 nr_pages, PAGES_FOR_IO, free);
 
 	return free > nr_pages + PAGES_FOR_IO;
 }
@@ -1882,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
  */
 static inline int get_highmem_buffer(int safe_needed)
 {
-	buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed);
+	buffer = get_image_page(GFP_ATOMIC, safe_needed);
 	return buffer ? 0 : -ENOMEM;
 }
 
@@ -1943,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm,
 		while (nr_pages-- > 0) {
 			struct page *page;
 
-			page = alloc_image_page(GFP_ATOMIC | __GFP_COLD);
+			page = alloc_image_page(GFP_ATOMIC);
 			if (!page)
 				goto err_out;
 			memory_bm_set_bit(copy_bm, page_to_pfn(page));
@@ -1961,20 +1963,20 @@ asmlinkage __visible int swsusp_save(void)
 {
 	unsigned int nr_pages, nr_highmem;
 
-	printk(KERN_INFO "PM: Creating hibernation image:\n");
+	pr_info("Creating hibernation image:\n");
 
 	drain_local_pages(NULL);
 	nr_pages = count_data_pages();
 	nr_highmem = count_highmem_pages();
-	printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
+	pr_info("Need to copy %u pages\n", nr_pages + nr_highmem);
 
 	if (!enough_free_mem(nr_pages, nr_highmem)) {
-		printk(KERN_ERR "PM: Not enough free memory\n");
+		pr_err("Not enough free memory\n");
 		return -ENOMEM;
 	}
 
 	if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
-		printk(KERN_ERR "PM: Memory allocation failed\n");
+		pr_err("Memory allocation failed\n");
 		return -ENOMEM;
 	}
 
@@ -1995,8 +1997,7 @@ asmlinkage __visible int swsusp_save(void)
 	nr_copy_pages = nr_pages;
 	nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
 
-	printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
-		nr_pages);
+	pr_info("Hibernation image created (%d pages copied)\n", nr_pages);
 
 	return 0;
 }
@@ -2170,7 +2171,7 @@ static int check_header(struct swsusp_info *info)
 	if (!reason && info->num_physpages != get_num_physpages())
 		reason = "memory size";
 	if (reason) {
-		printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
+		pr_err("Image mismatch: %s\n", reason);
 		return -EPERM;
 	}
 	return 0;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ccd2d20e6b06..0685c4499431 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -437,7 +437,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 			error = suspend_ops->enter(state);
 			trace_suspend_resume(TPS("machine_suspend"),
 				state, false);
-			events_check_enabled = false;
 		} else if (*wakeup) {
 			error = -EBUSY;
 		}
@@ -582,6 +581,7 @@ static int enter_state(suspend_state_t state)
 	pm_restore_gfp_mask();
 
  Finish:
+	events_check_enabled = false;
 	pm_pr_dbg("Finishing wakeup.\n");
 	suspend_finish();
  Unlock:
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index d7cdc426ee38..293ead59eccc 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -12,6 +12,8 @@
  *
  */
 
+#define pr_fmt(fmt) "PM: " fmt
+
 #include <linux/module.h>
 #include <linux/file.h>
 #include <linux/delay.h>
@@ -241,9 +243,9 @@ static void hib_end_io(struct bio *bio)
 	struct page *page = bio->bi_io_vec[0].bv_page;
 
 	if (bio->bi_status) {
-		printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
-				MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
-				(unsigned long long)bio->bi_iter.bi_sector);
+		pr_alert("Read-error on swap-device (%u:%u:%Lu)\n",
+			 MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)),
+			 (unsigned long long)bio->bi_iter.bi_sector);
 	}
 
 	if (bio_data_dir(bio) == WRITE)
@@ -273,8 +275,8 @@ static int hib_submit_io(int op, int op_flags, pgoff_t page_off, void *addr,
 	bio_set_op_attrs(bio, op, op_flags);
 
 	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
-		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
-			(unsigned long long)bio->bi_iter.bi_sector);
+		pr_err("Adding page to bio failed at %llu\n",
+		       (unsigned long long)bio->bi_iter.bi_sector);
 		bio_put(bio);
 		return -EFAULT;
 	}
@@ -319,7 +321,7 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
 		error = hib_submit_io(REQ_OP_WRITE, REQ_SYNC,
 				      swsusp_resume_block, swsusp_header, NULL);
 	} else {
-		printk(KERN_ERR "PM: Swap header not found!\n");
+		pr_err("Swap header not found!\n");
 		error = -ENODEV;
 	}
 	return error;
@@ -413,8 +415,7 @@ static int get_swap_writer(struct swap_map_handle *handle)
 	ret = swsusp_swap_check();
 	if (ret) {
 		if (ret != -ENOSPC)
-			printk(KERN_ERR "PM: Cannot find swap device, try "
-					"swapon -a.\n");
+			pr_err("Cannot find swap device, try swapon -a\n");
 		return ret;
 	}
 	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
@@ -491,9 +492,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
 {
 	if (!error) {
 		flush_swap_writer(handle);
-		printk(KERN_INFO "PM: S");
+		pr_info("S");
 		error = mark_swapfiles(handle, flags);
-		printk("|\n");
+		pr_cont("|\n");
 	}
 
 	if (error)
@@ -542,7 +543,7 @@ static int save_image(struct swap_map_handle *handle,
 
 	hib_init_batch(&hb);
 
-	printk(KERN_INFO "PM: Saving image data pages (%u pages)...\n",
+	pr_info("Saving image data pages (%u pages)...\n",
 		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
@@ -557,8 +558,8 @@ static int save_image(struct swap_map_handle *handle,
 		if (ret)
 			break;
 		if (!(nr_pages % m))
-			printk(KERN_INFO "PM: Image saving progress: %3d%%\n",
-			       nr_pages / m * 10);
+			pr_info("Image saving progress: %3d%%\n",
+				nr_pages / m * 10);
 		nr_pages++;
 	}
 	err2 = hib_wait_io(&hb);
@@ -566,7 +567,7 @@ static int save_image(struct swap_map_handle *handle,
 	if (!ret)
 		ret = err2;
 	if (!ret)
-		printk(KERN_INFO "PM: Image saving done.\n");
+		pr_info("Image saving done\n");
 	swsusp_show_speed(start, stop, nr_to_write, "Wrote");
 	return ret;
 }
@@ -692,14 +693,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
 
 	page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
 	if (!page) {
-		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
 	data = vmalloc(sizeof(*data) * nr_threads);
 	if (!data) {
-		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -708,7 +709,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 
 	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
-		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -726,8 +727,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 		                            "image_compress/%u", thr);
 		if (IS_ERR(data[thr].thr)) {
 			data[thr].thr = NULL;
-			printk(KERN_ERR
-			       "PM: Cannot start compression threads\n");
+			pr_err("Cannot start compression threads\n");
 			ret = -ENOMEM;
 			goto out_clean;
 		}
@@ -749,7 +749,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
 	if (IS_ERR(crc->thr)) {
 		crc->thr = NULL;
-		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		pr_err("Cannot start CRC32 thread\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -760,10 +760,9 @@ static int save_image_lzo(struct swap_map_handle *handle,
 	 */
 	handle->reqd_free_pages = reqd_free_pages();
 
-	printk(KERN_INFO
-		"PM: Using %u thread(s) for compression.\n"
-		"PM: Compressing and saving image data (%u pages)...\n",
-		nr_threads, nr_to_write);
+	pr_info("Using %u thread(s) for compression\n", nr_threads);
+	pr_info("Compressing and saving image data (%u pages)...\n",
+		nr_to_write);
 	m = nr_to_write / 10;
 	if (!m)
 		m = 1;
@@ -783,10 +782,8 @@ static int save_image_lzo(struct swap_map_handle *handle,
 				       data_of(*snapshot), PAGE_SIZE);
 
 				if (!(nr_pages % m))
-					printk(KERN_INFO
-					       "PM: Image saving progress: "
-					       "%3d%%\n",
-				               nr_pages / m * 10);
+					pr_info("Image saving progress: %3d%%\n",
+						nr_pages / m * 10);
 				nr_pages++;
 			}
 			if (!off)
@@ -813,15 +810,14 @@ static int save_image_lzo(struct swap_map_handle *handle,
 			ret = data[thr].ret;
 
 			if (ret < 0) {
-				printk(KERN_ERR "PM: LZO compression failed\n");
+				pr_err("LZO compression failed\n");
 				goto out_finish;
 			}
 
 			if (unlikely(!data[thr].cmp_len ||
 			             data[thr].cmp_len >
 			             lzo1x_worst_compress(data[thr].unc_len))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO compressed length\n");
+				pr_err("Invalid LZO compressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -857,7 +853,7 @@ out_finish:
 	if (!ret)
 		ret = err2;
 	if (!ret)
-		printk(KERN_INFO "PM: Image saving done.\n");
+		pr_info("Image saving done\n");
 	swsusp_show_speed(start, stop, nr_to_write, "Wrote");
 out_clean:
 	if (crc) {
@@ -888,7 +884,7 @@ static int enough_swap(unsigned int nr_pages, unsigned int flags)
 	unsigned int free_swap = count_swap_pages(root_swap, 1);
 	unsigned int required;
 
-	pr_debug("PM: Free swap pages: %u\n", free_swap);
+	pr_debug("Free swap pages: %u\n", free_swap);
 
 	required = PAGES_FOR_IO + nr_pages;
 	return free_swap > required;
@@ -915,12 +911,12 @@ int swsusp_write(unsigned int flags)
 	pages = snapshot_get_image_size();
 	error = get_swap_writer(&handle);
 	if (error) {
-		printk(KERN_ERR "PM: Cannot get swap writer\n");
+		pr_err("Cannot get swap writer\n");
 		return error;
 	}
 	if (flags & SF_NOCOMPRESS_MODE) {
 		if (!enough_swap(pages, flags)) {
-			printk(KERN_ERR "PM: Not enough free swap\n");
+			pr_err("Not enough free swap\n");
 			error = -ENOSPC;
 			goto out_finish;
 		}
@@ -1068,8 +1064,7 @@ static int load_image(struct swap_map_handle *handle,
 	hib_init_batch(&hb);
 
 	clean_pages_on_read = true;
-	printk(KERN_INFO "PM: Loading image data pages (%u pages)...\n",
-		nr_to_read);
+	pr_info("Loading image data pages (%u pages)...\n", nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
@@ -1087,8 +1082,8 @@ static int load_image(struct swap_map_handle *handle,
 		if (ret)
 			break;
 		if (!(nr_pages % m))
-			printk(KERN_INFO "PM: Image loading progress: %3d%%\n",
-			       nr_pages / m * 10);
+			pr_info("Image loading progress: %3d%%\n",
+				nr_pages / m * 10);
 		nr_pages++;
 	}
 	err2 = hib_wait_io(&hb);
@@ -1096,7 +1091,7 @@ static int load_image(struct swap_map_handle *handle,
 	if (!ret)
 		ret = err2;
 	if (!ret) {
-		printk(KERN_INFO "PM: Image loading done.\n");
+		pr_info("Image loading done\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			ret = -ENODATA;
@@ -1190,14 +1185,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
 	if (!page) {
-		printk(KERN_ERR "PM: Failed to allocate LZO page\n");
+		pr_err("Failed to allocate LZO page\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
 
 	data = vmalloc(sizeof(*data) * nr_threads);
 	if (!data) {
-		printk(KERN_ERR "PM: Failed to allocate LZO data\n");
+		pr_err("Failed to allocate LZO data\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1206,7 +1201,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 
 	crc = kmalloc(sizeof(*crc), GFP_KERNEL);
 	if (!crc) {
-		printk(KERN_ERR "PM: Failed to allocate crc\n");
+		pr_err("Failed to allocate crc\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1226,8 +1221,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		                            "image_decompress/%u", thr);
 		if (IS_ERR(data[thr].thr)) {
 			data[thr].thr = NULL;
-			printk(KERN_ERR
-			       "PM: Cannot start decompression threads\n");
+			pr_err("Cannot start decompression threads\n");
 			ret = -ENOMEM;
 			goto out_clean;
 		}
@@ -1249,7 +1243,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
 	if (IS_ERR(crc->thr)) {
 		crc->thr = NULL;
-		printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
+		pr_err("Cannot start CRC32 thread\n");
 		ret = -ENOMEM;
 		goto out_clean;
 	}
@@ -1274,8 +1268,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 		if (!page[i]) {
 			if (i < LZO_CMP_PAGES) {
 				ring_size = i;
-				printk(KERN_ERR
-				       "PM: Failed to allocate LZO pages\n");
+				pr_err("Failed to allocate LZO pages\n");
 				ret = -ENOMEM;
 				goto out_clean;
 			} else {
@@ -1285,10 +1278,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
 	}
 	want = ring_size = i;
 
-	printk(KERN_INFO
-		"PM: Using %u thread(s) for decompression.\n"
-		"PM: Loading and decompressing image data (%u pages)...\n",
-		nr_threads, nr_to_read);
+	pr_info("Using %u thread(s) for decompression\n", nr_threads);
+	pr_info("Loading and decompressing image data (%u pages)...\n",
+		nr_to_read);
 	m = nr_to_read / 10;
 	if (!m)
 		m = 1;
@@ -1348,8 +1340,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			if (unlikely(!data[thr].cmp_len ||
 			             data[thr].cmp_len >
 			             lzo1x_worst_compress(LZO_UNC_SIZE))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO compressed length\n");
+				pr_err("Invalid LZO compressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -1400,16 +1391,14 @@ static int load_image_lzo(struct swap_map_handle *handle,
 			ret = data[thr].ret;
 
 			if (ret < 0) {
-				printk(KERN_ERR
-				       "PM: LZO decompression failed\n");
+				pr_err("LZO decompression failed\n");
 				goto out_finish;
 			}
 
 			if (unlikely(!data[thr].unc_len ||
 			             data[thr].unc_len > LZO_UNC_SIZE ||
 			             data[thr].unc_len & (PAGE_SIZE - 1))) {
-				printk(KERN_ERR
-				       "PM: Invalid LZO uncompressed length\n");
+				pr_err("Invalid LZO uncompressed length\n");
 				ret = -1;
 				goto out_finish;
 			}
@@ -1420,10 +1409,8 @@ static int load_image_lzo(struct swap_map_handle *handle,
 				       data[thr].unc + off, PAGE_SIZE);
 
 				if (!(nr_pages % m))
-					printk(KERN_INFO
-					       "PM: Image loading progress: "
-					       "%3d%%\n",
-					       nr_pages / m * 10);
+					pr_info("Image loading progress: %3d%%\n",
+						nr_pages / m * 10);
 				nr_pages++;
 
 				ret = snapshot_write_next(snapshot);
@@ -1448,15 +1435,14 @@ out_finish:
 	}
 	stop = ktime_get();
 	if (!ret) {
-		printk(KERN_INFO "PM: Image loading done.\n");
+		pr_info("Image loading done\n");
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			ret = -ENODATA;
 		if (!ret) {
 			if (swsusp_header->flags & SF_CRC32_MODE) {
 				if(handle->crc32 != swsusp_header->crc32) {
-					printk(KERN_ERR
-					       "PM: Invalid image CRC32!\n");
+					pr_err("Invalid image CRC32!\n");
 					ret = -ENODATA;
 				}
 			}
@@ -1513,9 +1499,9 @@ int swsusp_read(unsigned int *flags_p)
 	swap_reader_finish(&handle);
 end:
 	if (!error)
-		pr_debug("PM: Image successfully loaded\n");
+		pr_debug("Image successfully loaded\n");
 	else
-		pr_debug("PM: Error %d resuming\n", error);
+		pr_debug("Error %d resuming\n", error);
 	return error;
 }
 
@@ -1552,13 +1538,13 @@ put:
 		if (error)
 			blkdev_put(hib_resume_bdev, FMODE_READ);
 		else
-			pr_debug("PM: Image signature found, resuming\n");
+			pr_debug("Image signature found, resuming\n");
 	} else {
 		error = PTR_ERR(hib_resume_bdev);
 	}
 
 	if (error)
-		pr_debug("PM: Image not found (code %d)\n", error);
+		pr_debug("Image not found (code %d)\n", error);
 
 	return error;
 }
@@ -1570,7 +1556,7 @@ put:
 void swsusp_close(fmode_t mode)
 {
 	if (IS_ERR(hib_resume_bdev)) {
-		pr_debug("PM: Image device not initialised\n");
+		pr_debug("Image device not initialised\n");
 		return;
 	}
 
@@ -1594,7 +1580,7 @@ int swsusp_unmark(void)
 					swsusp_resume_block,
 					swsusp_header, NULL);
 	} else {
-		printk(KERN_ERR "PM: Cannot find swsusp signature!\n");
+		pr_err("Cannot find swsusp signature!\n");
 		error = -ENODEV;
 	}
 
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 512f7c2baedd..b9006617710f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2190,7 +2190,7 @@ again:
 		}
 
 		if (console_seq < log_first_seq) {
-			len = sprintf(text, "** %u printk messages dropped ** ",
+			len = sprintf(text, "** %u printk messages dropped **\n",
 				      (unsigned)(log_first_seq - console_seq));
 
 			/* messages are gone, move to first one */
@@ -3141,9 +3141,6 @@ void dump_stack_print_info(const char *log_lvl)
 void show_regs_print_info(const char *log_lvl)
 {
 	dump_stack_print_info(log_lvl);
-
-	printk("%stask: %p task.stack: %p\n",
-	       log_lvl, current, task_stack_page(current));
 }
 
 #endif
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 3cdaeaef9ce1..3e3c2004bb23 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -39,7 +39,7 @@
  * There are situations when we want to make sure that all buffers
  * were handled or when IRQs are blocked.
  */
-static int printk_safe_irq_ready;
+static int printk_safe_irq_ready __read_mostly;
 
 #define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) -	\
 				sizeof(atomic_t) -			\
@@ -63,11 +63,8 @@ static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq);
 /* Get flushed in a more safe context. */
 static void queue_flush_work(struct printk_safe_seq_buf *s)
 {
-	if (printk_safe_irq_ready) {
-		/* Make sure that IRQ work is really initialized. */
-		smp_rmb();
+	if (printk_safe_irq_ready)
 		irq_work_queue(&s->work);
-	}
 }
 
 /*
@@ -75,7 +72,7 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
  * have dedicated buffers, because otherwise printk-safe preempted by
  * NMI-printk would have overwritten the NMI messages.
  *
- * The messages are fushed from irq work (or from panic()), possibly,
+ * The messages are flushed from irq work (or from panic()), possibly,
  * from other CPU, concurrently with printk_safe_log_store(). Should this
  * happen, printk_safe_log_store() will notice the buffer->len mismatch
  * and repeat the write.
@@ -398,8 +395,12 @@ void __init printk_safe_init(void)
 #endif
 	}
 
-	/* Make sure that IRQ works are initialized before enabling. */
-	smp_wmb();
+	/*
+	 * In the highly unlikely event that a NMI were to trigger at
+	 * this moment. Make sure IRQ work is set up before this
+	 * variable is set.
+	 */
+	barrier();
 	printk_safe_irq_ready = 1;
 
 	/* Flush pending messages that did not have scheduled IRQ works. */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..e4ced883d8de 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,33 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
+static void devm_unregister_reboot_notifier(struct device *dev, void *res)
+{
+	WARN_ON(unregister_reboot_notifier(*(struct notifier_block **)res));
+}
+
+int devm_register_reboot_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct notifier_block **rcnb;
+	int ret;
+
+	rcnb = devres_alloc(devm_unregister_reboot_notifier,
+			    sizeof(*rcnb), GFP_KERNEL);
+	if (!rcnb)
+		return -ENOMEM;
+
+	ret = register_reboot_notifier(nb);
+	if (!ret) {
+		*rcnb = nb;
+		devres_add(dev, rcnb);
+	} else {
+		devres_free(rcnb);
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(devm_register_reboot_notifier);
+
 /*
  *	Notifier list for kernel code which wants to be called
  *	to restart the system.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b82a0073532..75554f366fd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
 #include <linux/init_task.h>
 #include <linux/context_tracking.h>
 #include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
 
 #include <linux/blkdev.h>
 #include <linux/kprobes.h>
@@ -5107,13 +5108,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
  * an error code.
  */
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
-		struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
 {
 	struct task_struct *p;
 	unsigned int time_slice;
 	struct rq_flags rf;
-	struct timespec t;
 	struct rq *rq;
 	int retval;
 
@@ -5137,15 +5136,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 	task_rq_unlock(rq, p, &rf);
 
 	rcu_read_unlock();
-	jiffies_to_timespec(time_slice, &t);
-	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
-	return retval;
+	jiffies_to_timespec64(time_slice, t);
+	return 0;
 
 out_unlock:
 	rcu_read_unlock();
 	return retval;
 }
 
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval = sched_rr_get_interval(pid, &t);
+
+	if (retval == 0)
+		retval = put_timespec64(&t, interval);
+
+	return retval;
+}
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+		       compat_pid_t, pid,
+		       struct compat_timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval = sched_rr_get_interval(pid, &t);
+
+	if (retval == 0)
+		retval = compat_put_timespec64(&t, interval);
+	return retval;
+}
+#endif
+
 void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
@@ -6620,7 +6644,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
 	return ret;
 }
 
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
 {
 	struct task_group *tg = css_tg(seq_css(sf));
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6660,7 +6684,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
 		.name = "shares",
@@ -6681,7 +6705,7 @@ static struct cftype cpu_files[] = {
 	},
 	{
 		.name = "stat",
-		.seq_show = cpu_stats_show,
+		.seq_show = cpu_cfs_stat_show,
 	},
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -6699,16 +6723,182 @@ static struct cftype cpu_files[] = {
 	{ }	/* Terminate */
 };
 
+static int cpu_extra_stat_show(struct seq_file *sf,
+			       struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct task_group *tg = css_tg(css);
+		struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
+		u64 throttled_usec;
+
+		throttled_usec = cfs_b->throttled_time;
+		do_div(throttled_usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   cfs_b->nr_periods, cfs_b->nr_throttled,
+			   throttled_usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct task_group *tg = css_tg(css);
+	u64 weight = scale_load_down(tg->shares);
+
+	return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+}
+
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cft, u64 weight)
+{
+	/*
+	 * cgroup weight knobs should use the common MIN, DFL and MAX
+	 * values which are 1, 100 and 10000 respectively.  While it loses
+	 * a bit of range on both ends, it maps pretty well onto the shares
+	 * value used by scheduler and the round-trip conversions preserve
+	 * the original value over the entire range.
+	 */
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+				    struct cftype *cft)
+{
+	unsigned long weight = scale_load_down(css_tg(css)->shares);
+	int last_delta = INT_MAX;
+	int prio, delta;
+
+	/* find the closest nice value to the current weight */
+	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
+		delta = abs(sched_prio_to_weight[prio] - weight);
+		if (delta >= last_delta)
+			break;
+		last_delta = delta;
+	}
+
+	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
+}
+
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cft, s64 nice)
+{
+	unsigned long weight;
+
+	if (nice < MIN_NICE || nice > MAX_NICE)
+		return -ERANGE;
+
+	weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO];
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota < 0)
+		seq_puts(sf, "max");
+	else
+		seq_printf(sf, "%ld", quota);
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/* caller should put the current value in *@periodp before calling */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	if (!sscanf(buf, "%s %llu", tok, periodp))
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+
+	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
+	return 0;
+}
+
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (!ret)
+		ret = tg_set_cfs_bandwidth(tg, period, quota);
+	return ret ?: nbytes;
+}
+#endif
+
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	{
+		.name = "weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+	},
+	{
+		.name = "weight.nice",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_s64 = cpu_weight_nice_read_s64,
+		.write_s64 = cpu_weight_nice_write_s64,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		.name = "max",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+	},
+#endif
+	{ }	/* terminate */
+};
+
 struct cgroup_subsys cpu_cgrp_subsys = {
 	.css_alloc	= cpu_cgroup_css_alloc,
 	.css_online	= cpu_cgroup_css_online,
 	.css_released	= cpu_cgroup_css_released,
 	.css_free	= cpu_cgroup_css_free,
+	.css_extra_stat_show = cpu_extra_stat_show,
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.legacy_cftypes	= cpu_files,
+	.legacy_cftypes	= cpu_legacy_files,
+	.dfl_cftypes	= cpu_files,
 	.early_init	= true,
+	.threaded	= true,
 };
 
 #endif	/* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644
index a8358a57a316..000000000000
--- a/kernel/sched/cpuacct.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
-{
-}
-
-#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index ba0da243fdd8..2f52ec0f1539 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 		 * Do not reduce the frequency if the CPU has not been idle
 		 * recently, as the reduction is likely to be premature then.
 		 */
-		if (busy && next_f < sg_policy->next_freq)
+		if (busy && next_f < sg_policy->next_freq) {
 			next_f = sg_policy->next_freq;
+
+			/* Reset cached freq as next_freq has changed */
+			sg_policy->cached_raw_freq = 0;
+		}
 	}
 	sugov_update_commit(sg_policy, time, next_f);
 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9be8b68a66da..bac6ac9a4ec7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
 	 */
 	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
-	cpuacct_account_field(p, index, tmp);
+	cgroup_account_cputime_field(p, index, tmp);
 }
 
 /*
@@ -446,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+		    u64 *ut, u64 *st)
+{
+	*ut = curr->utime;
+	*st = curr->stime;
+}
+
 void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
 {
 	*ut = p->utime;
@@ -584,9 +591,8 @@ drop_precision:
  *
  * Assuming that rtime_i+1 >= rtime_i.
  */
-static void cputime_adjust(struct task_cputime *curr,
-			   struct prev_cputime *prev,
-			   u64 *ut, u64 *st)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+		    u64 *ut, u64 *st)
 {
 	u64 rtime, stime, utime;
 	unsigned long flags;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index f349f7e98dec..2473736c7616 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1144,7 +1144,7 @@ static void update_curr_dl(struct rq *rq)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0989676c50e9..4037e19bbca2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -844,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		struct task_struct *curtask = task_of(curr);
 
 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
-		cpuacct_charge(curtask, delta_exec);
+		cgroup_account_cputime(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d8c43d73e078..4056c19ca3f0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -969,7 +969,7 @@ static void update_curr_rt(struct rq *rq)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 
 	sched_rt_avg_update(rq, delta_exec);
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 45ab0bf564e7..b19552a212de 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -30,6 +30,7 @@
 #include <linux/irq_work.h>
 #include <linux/tick.h>
 #include <linux/slab.h>
+#include <linux/cgroup.h>
 
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
@@ -37,7 +38,6 @@
 
 #include "cpupri.h"
 #include "cpudeadline.h"
-#include "cpuacct.h"
 
 #ifdef CONFIG_SCHED_DEBUG
 # define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 45caf90b24cd..210b1f2146ff 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -72,7 +72,7 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
 	account_group_exec_runtime(curr, delta_exec);
 
 	curr->se.exec_start = rq_clock_task(rq);
-	cpuacct_charge(curr, delta_exec);
+	cgroup_account_cputime(curr, delta_exec);
 }
 
 static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index f8159698aa4d..84cb3acd9260 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -183,7 +183,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
  */
 static __sched
 int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_entry,
-		       int (*action)(atomic_t *), unsigned mode)
+		       wait_atomic_t_action_f action, unsigned int mode)
 {
 	atomic_t *val;
 	int ret = 0;
@@ -193,7 +193,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		val = wbq_entry->key.flags;
 		if (atomic_read(val) == 0)
 			break;
-		ret = (*action)(val);
+		ret = (*action)(val, mode);
 	} while (!ret && atomic_read(val) != 0);
 	finish_wait(wq_head, &wbq_entry->wq_entry);
 	return ret;
@@ -210,8 +210,9 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
 		},							\
 	}
 
-__sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
-					 unsigned mode)
+__sched int out_of_line_wait_on_atomic_t(atomic_t *p,
+					 wait_atomic_t_action_f action,
+					 unsigned int mode)
 {
 	struct wait_queue_head *wq_head = atomic_t_waitqueue(p);
 	DEFINE_WAIT_ATOMIC_T(wq_entry, p);
@@ -220,6 +221,15 @@ __sched int out_of_line_wait_on_atomic_t(atomic_t *p, int (*action)(atomic_t *),
 }
 EXPORT_SYMBOL(out_of_line_wait_on_atomic_t);
 
+__sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
+{
+	schedule();
+	if (signal_pending_state(mode, current))
+		return -EINTR;
+	return 0;
+}
+EXPORT_SYMBOL(atomic_t_wait);
+
 /**
  * wake_up_atomic_t - Wake up a waiter on a atomic_t
  * @p: The atomic_t being waited on, a kernel virtual address
diff --git a/kernel/signal.c b/kernel/signal.c
index 8dcd8825b2de..9558664bd9ec 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -78,7 +78,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
 	handler = sig_handler(t, sig);
 
 	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
-			handler == SIG_DFL && !force)
+	    handler == SIG_DFL && !(force && sig_kernel_only(sig)))
 		return 1;
 
 	return sig_handler_ignored(handler, sig);
@@ -94,13 +94,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	if (!sig_task_ignored(t, sig, force))
-		return 0;
-
 	/*
-	 * Tracers may want to know about even ignored signals.
+	 * Tracers may want to know about even ignored signal unless it
+	 * is SIGKILL which can't be reported anyway but can be ignored
+	 * by SIGNAL_UNKILLABLE task.
 	 */
-	return !t->ptrace;
+	if (t->ptrace && sig != SIGKILL)
+		return 0;
+
+	return sig_task_ignored(t, sig, force);
 }
 
 /*
@@ -929,9 +931,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	 * then start taking the whole group down immediately.
 	 */
 	if (sig_fatal(p, sig) &&
-	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+	    !(signal->flags & SIGNAL_GROUP_EXIT) &&
 	    !sigismember(&t->real_blocked, sig) &&
-	    (sig == SIGKILL || !t->ptrace)) {
+	    (sig == SIGKILL || !p->ptrace)) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
@@ -1036,8 +1038,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	else
 		override_rlimit = 0;
 
-	q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
-		override_rlimit);
+	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
 	if (q) {
 		list_add_tail(&q->list, &pending->list);
 		switch ((unsigned long) info) {
@@ -2600,7 +2601,6 @@ SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
 COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		compat_sigset_t __user *, oset, compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t old_set = current->blocked;
 
 	/* XXX: Don't preclude handling different sized sigset_t's.  */
@@ -2608,38 +2608,22 @@ COMPAT_SYSCALL_DEFINE4(rt_sigprocmask, int, how, compat_sigset_t __user *, nset,
 		return -EINVAL;
 
 	if (nset) {
-		compat_sigset_t new32;
 		sigset_t new_set;
 		int error;
-		if (copy_from_user(&new32, nset, sizeof(compat_sigset_t)))
+		if (get_compat_sigset(&new_set, nset))
 			return -EFAULT;
-
-		sigset_from_compat(&new_set, &new32);
 		sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
 
 		error = sigprocmask(how, &new_set, NULL);
 		if (error)
 			return error;
 	}
-	if (oset) {
-		compat_sigset_t old32;
-		sigset_to_compat(&old32, &old_set);
-		if (copy_to_user(oset, &old32, sizeof(compat_sigset_t)))
-			return -EFAULT;
-	}
-	return 0;
-#else
-	return sys_rt_sigprocmask(how, (sigset_t __user *)nset,
-				  (sigset_t __user *)oset, sigsetsize);
-#endif
+	return oset ? put_compat_sigset(oset, &old_set, sizeof(*oset)) : 0;
 }
 #endif
 
-static int do_sigpending(void *set, unsigned long sigsetsize)
+static int do_sigpending(sigset_t *set)
 {
-	if (sigsetsize > sizeof(sigset_t))
-		return -EINVAL;
-
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(set, &current->pending.signal,
 		  &current->signal->shared_pending.signal);
@@ -2659,7 +2643,12 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
 SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
 {
 	sigset_t set;
-	int err = do_sigpending(&set, sigsetsize);
+	int err;
+
+	if (sigsetsize > sizeof(*uset))
+		return -EINVAL;
+
+	err = do_sigpending(&set);
 	if (!err && copy_to_user(uset, &set, sigsetsize))
 		err = -EFAULT;
 	return err;
@@ -2669,20 +2658,16 @@ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
 COMPAT_SYSCALL_DEFINE2(rt_sigpending, compat_sigset_t __user *, uset,
 		compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t set;
-	int err = do_sigpending(&set, sigsetsize);
-	if (!err) {
-		compat_sigset_t set32;
-		sigset_to_compat(&set32, &set);
-		/* we can get here only if sigsetsize <= sizeof(set) */
-		if (copy_to_user(uset, &set32, sigsetsize))
-			err = -EFAULT;
-	}
+	int err;
+
+	if (sigsetsize > sizeof(*uset))
+		return -EINVAL;
+
+	err = do_sigpending(&set);
+	if (!err)
+		err = put_compat_sigset(uset, &set, sigsetsize);
 	return err;
-#else
-	return sys_rt_sigpending((sigset_t __user *)uset, sigsetsize);
-#endif
 }
 #endif
 
@@ -2916,7 +2901,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 		struct compat_siginfo __user *, uinfo,
 		struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
 {
-	compat_sigset_t s32;
 	sigset_t s;
 	struct timespec t;
 	siginfo_t info;
@@ -2925,9 +2909,8 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
 
-	if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+	if (get_compat_sigset(&s, uthese))
 		return -EFAULT;
-	sigset_from_compat(&s, &s32);
 
 	if (uts) {
 		if (compat_get_timespec(&t, uts))
@@ -3345,15 +3328,11 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t set;
-	int err = do_sigpending(&set, sizeof(set.sig[0]));
+	int err = do_sigpending(&set);
 	if (!err)
 		err = put_user(set.sig[0], set32);
 	return err;
-#else
-	return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
-#endif
 }
 #endif
 
@@ -3451,7 +3430,6 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 		compat_size_t, sigsetsize)
 {
 	struct k_sigaction new_ka, old_ka;
-	compat_sigset_t mask;
 #ifdef __ARCH_HAS_SA_RESTORER
 	compat_uptr_t restorer;
 #endif
@@ -3469,19 +3447,18 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig,
 		ret |= get_user(restorer, &act->sa_restorer);
 		new_ka.sa.sa_restorer = compat_ptr(restorer);
 #endif
-		ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask));
+		ret |= get_compat_sigset(&new_ka.sa.sa_mask, &act->sa_mask);
 		ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags);
 		if (ret)
 			return -EFAULT;
-		sigset_from_compat(&new_ka.sa.sa_mask, &mask);
 	}
 
 	ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL);
 	if (!ret && oact) {
-		sigset_to_compat(&mask, &old_ka.sa.sa_mask);
 		ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), 
 			       &oact->sa_handler);
-		ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask));
+		ret |= put_compat_sigset(&oact->sa_mask, &old_ka.sa.sa_mask,
+					 sizeof(oact->sa_mask));
 		ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags);
 #ifdef __ARCH_HAS_SA_RESTORER
 		ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer),
@@ -3661,22 +3638,15 @@ SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
 #ifdef CONFIG_COMPAT
 COMPAT_SYSCALL_DEFINE2(rt_sigsuspend, compat_sigset_t __user *, unewset, compat_size_t, sigsetsize)
 {
-#ifdef __BIG_ENDIAN
 	sigset_t newset;
-	compat_sigset_t newset32;
 
 	/* XXX: Don't preclude handling different sized sigset_t's.  */
 	if (sigsetsize != sizeof(sigset_t))
 		return -EINVAL;
 
-	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+	if (get_compat_sigset(&newset, unewset))
 		return -EFAULT;
-	sigset_from_compat(&newset, &newset32);
 	return sigsuspend(&newset);
-#else
-	/* on little-endian bitmaps don't care about granularity */
-	return sys_rt_sigsuspend((sigset_t __user *)unewset, sigsetsize);
-#endif
 }
 #endif
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 662f7b1b7a78..2f5e87f1bae2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t)
 }
 EXPORT_SYMBOL(__tasklet_hi_schedule);
 
-void __tasklet_hi_schedule_first(struct tasklet_struct *t)
-{
-	lockdep_assert_irqs_disabled();
-
-	t->next = __this_cpu_read(tasklet_hi_vec.head);
-	__this_cpu_write(tasklet_hi_vec.head, t);
-	__raise_softirq_irqoff(HI_SOFTIRQ);
-}
-EXPORT_SYMBOL(__tasklet_hi_schedule_first);
-
 static __latent_entropy void tasklet_action(struct softirq_action *a)
 {
 	struct tasklet_struct *list;
diff --git a/kernel/sys.c b/kernel/sys.c
index 524a4cb9bbe2..83ffd7dccf23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -111,6 +111,12 @@
 #ifndef SET_FP_MODE
 # define SET_FP_MODE(a,b)	(-EINVAL)
 #endif
+#ifndef SVE_SET_VL
+# define SVE_SET_VL(a)		(-EINVAL)
+#endif
+#ifndef SVE_GET_VL
+# define SVE_GET_VL()		(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2386,6 +2392,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_SVE_SET_VL:
+		error = SVE_SET_VL(arg2);
+		break;
+	case PR_SVE_GET_VL:
+		error = SVE_GET_VL();
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d9c31bc2eaea..557d46728577 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -30,7 +30,6 @@
 #include <linux/proc_fs.h>
 #include <linux/security.h>
 #include <linux/ctype.h>
-#include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -67,6 +66,7 @@
 #include <linux/kexec.h>
 #include <linux/bpf.h>
 #include <linux/mount.h>
+#include <linux/pipe_fs_i.h>
 
 #include <linux/uaccess.h>
 #include <asm/processor.h>
@@ -1174,15 +1174,6 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &one_thousand,
 	},
 #endif
-#ifdef CONFIG_KMEMCHECK
-	{
-		.procname	= "kmemcheck",
-		.data		= &kmemcheck_enabled,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-#endif
 	{
 		.procname	= "panic_on_warn",
 		.data		= &panic_on_warn,
@@ -1342,11 +1333,6 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
-		.procname       = "nr_pdflush_threads",
-		.mode           = 0444 /* read-only */,
-		.proc_handler   = pdflush_proc_obsolete,
-	},
-	{
 		.procname	= "swappiness",
 		.data		= &vm_swappiness,
 		.maxlen		= sizeof(vm_swappiness),
@@ -1371,6 +1357,15 @@ static struct ctl_table vm_table[] = {
 		.mode           = 0644,
 		.proc_handler   = &hugetlb_mempolicy_sysctl_handler,
 	},
+	{
+		.procname		= "numa_stat",
+		.data			= &sysctl_vm_numa_stat,
+		.maxlen			= sizeof(int),
+		.mode			= 0644,
+		.proc_handler	= sysctl_vm_numa_stat_handler,
+		.extra1			= &zero,
+		.extra2			= &one,
+	},
 #endif
 	 {
 		.procname	= "hugetlb_shm_group",
@@ -1822,7 +1817,7 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "pipe-max-size",
 		.data		= &pipe_max_size,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(pipe_max_size),
 		.mode		= 0644,
 		.proc_handler	= &pipe_proc_fn,
 		.extra1		= &pipe_min_size,
@@ -2581,12 +2576,13 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
 	if (write) {
 		unsigned int val = *lvalp;
 
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
 		if ((param->min && *param->min > val) ||
 		    (param->max && *param->max < val))
 			return -ERANGE;
 
-		if (*lvalp > UINT_MAX)
-			return -EINVAL;
 		*valp = val;
 	} else {
 		unsigned int val = *valp;
@@ -2626,6 +2622,48 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 				 do_proc_douintvec_minmax_conv, &param);
 }
 
+struct do_proc_dopipe_max_size_conv_param {
+	unsigned int *min;
+};
+
+static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
+					unsigned int *valp,
+					int write, void *data)
+{
+	struct do_proc_dopipe_max_size_conv_param *param = data;
+
+	if (write) {
+		unsigned int val;
+
+		if (*lvalp > UINT_MAX)
+			return -EINVAL;
+
+		val = round_pipe_size(*lvalp);
+		if (val == 0)
+			return -EINVAL;
+
+		if (param->min && *param->min > val)
+			return -ERANGE;
+
+		*valp = val;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long) val;
+	}
+
+	return 0;
+}
+
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct do_proc_dopipe_max_size_conv_param param = {
+		.min = (unsigned int *) table->extra1,
+	};
+	return do_proc_douintvec(table, write, buffer, lenp, ppos,
+				 do_proc_dopipe_max_size_conv, &param);
+}
+
 static void validate_coredump_safety(void)
 {
 #ifdef CONFIG_COREDUMP
@@ -3089,14 +3127,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 			else
 				bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
 		}
-		kfree(tmp_bitmap);
 		*lenp -= left;
 		*ppos += *lenp;
-		return 0;
-	} else {
-		kfree(tmp_bitmap);
-		return err;
 	}
+
+	kfree(tmp_bitmap);
+	return err;
 }
 
 #else /* CONFIG_PROC_SYSCTL */
@@ -3131,6 +3167,12 @@ int proc_douintvec_minmax(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_dopipe_max_size(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_jiffies(struct ctl_table *table, int write,
 		    void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -3174,6 +3216,7 @@ EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
+EXPORT_SYMBOL_GPL(proc_dopipe_max_size);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
 EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
 EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index d689a9557e17..e776fc8cc1df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -21,10 +21,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE
 config GENERIC_TIME_VSYSCALL
 	bool
 
-# Timekeeping vsyscall support
-config GENERIC_TIME_VSYSCALL_OLD
-	bool
-
 # Old style timekeeping
 config ARCH_USES_GETTIMEOFFSET
 	bool
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 03918a19cf2d..65f9e3f24dde 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -171,7 +171,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
 	spin_unlock_irqrestore(&watchdog_lock, flags);
 }
 
-static void clocksource_watchdog(unsigned long data)
+static void clocksource_watchdog(struct timer_list *unused)
 {
 	struct clocksource *cs;
 	u64 csnow, wdnow, cslast, wdlast, delta;
@@ -290,8 +290,7 @@ static inline void clocksource_start_watchdog(void)
 {
 	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
 		return;
-	init_timer(&watchdog_timer);
-	watchdog_timer.function = clocksource_watchdog;
+	timer_setup(&watchdog_timer, clocksource_watchdog, 0);
 	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
 	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
 	watchdog_running = 1;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 198afa78bf69..cd03317e7b57 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -557,45 +557,6 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
 	update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
 }
 
-#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
-#warning Please contact your maintainers, as GENERIC_TIME_VSYSCALL_OLD compatibity will disappear soon.
-
-static inline void update_vsyscall(struct timekeeper *tk)
-{
-	struct timespec xt, wm;
-
-	xt = timespec64_to_timespec(tk_xtime(tk));
-	wm = timespec64_to_timespec(tk->wall_to_monotonic);
-	update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
-			    tk->tkr_mono.cycle_last);
-}
-
-static inline void old_vsyscall_fixup(struct timekeeper *tk)
-{
-	s64 remainder;
-
-	/*
-	* Store only full nanoseconds into xtime_nsec after rounding
-	* it up and add the remainder to the error difference.
-	* XXX - This is necessary to avoid small 1ns inconsistnecies caused
-	* by truncating the remainder in vsyscalls. However, it causes
-	* additional work to be done in timekeeping_adjust(). Once
-	* the vsyscall implementations are converted to use xtime_nsec
-	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
-	* users are removed, this can be killed.
-	*/
-	remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
-	if (remainder != 0) {
-		tk->tkr_mono.xtime_nsec -= remainder;
-		tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
-		tk->ntp_error += remainder << tk->ntp_error_shift;
-		tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
-	}
-}
-#else
-#define old_vsyscall_fixup(tk)
-#endif
-
 static RAW_NOTIFIER_HEAD(pvclock_gtod_chain);
 
 static void update_pvclock_gtod(struct timekeeper *tk, bool was_set)
@@ -2164,12 +2125,6 @@ void update_wall_time(void)
 	timekeeping_adjust(tk, offset);
 
 	/*
-	 * XXX This can be killed once everyone converts
-	 * to the new update_vsyscall.
-	 */
-	old_vsyscall_fixup(tk);
-
-	/*
 	 * Finally, make sure that after the rounding
 	 * xtime_nsec isn't larger than NSEC_PER_SEC
 	 */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index af0b8bae4502..ffebcf878fba 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -707,14 +707,18 @@ static inline void debug_timer_assert_init(struct timer_list *timer)
 	debug_object_assert_init(timer, &timer_debug_descr);
 }
 
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+			  void (*func)(struct timer_list *),
+			  unsigned int flags,
 			  const char *name, struct lock_class_key *key);
 
-void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags,
+void init_timer_on_stack_key(struct timer_list *timer,
+			     void (*func)(struct timer_list *),
+			     unsigned int flags,
 			     const char *name, struct lock_class_key *key)
 {
 	debug_object_init_on_stack(timer, &timer_debug_descr);
-	do_init_timer(timer, flags, name, key);
+	do_init_timer(timer, func, flags, name, key);
 }
 EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
 
@@ -755,10 +759,13 @@ static inline void debug_assert_init(struct timer_list *timer)
 	debug_timer_assert_init(timer);
 }
 
-static void do_init_timer(struct timer_list *timer, unsigned int flags,
+static void do_init_timer(struct timer_list *timer,
+			  void (*func)(struct timer_list *),
+			  unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
 	timer->entry.pprev = NULL;
+	timer->function = func;
 	timer->flags = flags | raw_smp_processor_id();
 	lockdep_init_map(&timer->lockdep_map, name, key, 0);
 }
@@ -766,6 +773,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
 /**
  * init_timer_key - initialize a timer
  * @timer: the timer to be initialized
+ * @func: timer callback function
  * @flags: timer flags
  * @name: name of the timer
  * @key: lockdep class key of the fake lock used for tracking timer
@@ -774,11 +782,12 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
  * init_timer_key() must be done to a timer prior calling *any* of the
  * other timer functions.
  */
-void init_timer_key(struct timer_list *timer, unsigned int flags,
+void init_timer_key(struct timer_list *timer,
+		    void (*func)(struct timer_list *), unsigned int flags,
 		    const char *name, struct lock_class_key *key)
 {
 	debug_init(timer);
-	do_init_timer(timer, flags, name, key);
+	do_init_timer(timer, func, flags, name, key);
 }
 EXPORT_SYMBOL(init_timer_key);
 
@@ -1107,12 +1116,12 @@ EXPORT_SYMBOL(timer_reduce);
  * add_timer - start a timer
  * @timer: the timer to be added
  *
- * The kernel will do a ->function(->data) callback from the
+ * The kernel will do a ->function(@timer) callback from the
  * timer interrupt at the ->expires point in the future. The
  * current time is 'jiffies'.
  *
- * The timer's ->expires, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
+ * The timer's ->expires, ->function fields must be set prior calling this
+ * function.
  *
  * Timers with an ->expires field in the past will be executed in the next
  * timer tick.
@@ -1284,8 +1293,7 @@ int del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
-static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
-			  unsigned long data)
+static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *))
 {
 	int count = preempt_count();
 
@@ -1309,7 +1317,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 	lock_map_acquire(&lockdep_map);
 
 	trace_timer_expire_entry(timer);
-	fn(data);
+	fn(timer);
 	trace_timer_expire_exit(timer);
 
 	lock_map_release(&lockdep_map);
@@ -1331,8 +1339,7 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 {
 	while (!hlist_empty(head)) {
 		struct timer_list *timer;
-		void (*fn)(unsigned long);
-		unsigned long data;
+		void (*fn)(struct timer_list *);
 
 		timer = hlist_entry(head->first, struct timer_list, entry);
 
@@ -1340,15 +1347,14 @@ static void expire_timers(struct timer_base *base, struct hlist_head *head)
 		detach_timer(timer, true);
 
 		fn = timer->function;
-		data = timer->data;
 
 		if (timer->flags & TIMER_IRQSAFE) {
 			raw_spin_unlock(&base->lock);
-			call_timer_fn(timer, fn, data);
+			call_timer_fn(timer, fn);
 			raw_spin_lock(&base->lock);
 		} else {
 			raw_spin_unlock_irq(&base->lock);
-			call_timer_fn(timer, fn, data);
+			call_timer_fn(timer, fn);
 			raw_spin_lock_irq(&base->lock);
 		}
 	}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 0e7f5428a148..0ed768b56c60 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -389,7 +389,7 @@ static int __init init_timer_list_procfs(void)
 {
 	struct proc_dir_entry *pe;
 
-	pe = proc_create("timer_list", 0444, NULL, &timer_list_fops);
+	pe = proc_create("timer_list", 0400, NULL, &timer_list_fops);
 	if (!pe)
 		return -ENOMEM;
 	return 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 434c840e2d82..af7dad126c13 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -160,6 +160,17 @@ config FUNCTION_GRAPH_TRACER
 	  address on the current task structure into a stack of calls.
 
 
+config PREEMPTIRQ_EVENTS
+	bool "Enable trace events for preempt and irq disable/enable"
+	select TRACE_IRQFLAGS
+	depends on DEBUG_PREEMPT || !PROVE_LOCKING
+	default n
+	help
+	  Enable tracing of disable and enable events for preemption and irqs.
+	  For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+	  enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+	  be disabled.
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
@@ -224,7 +235,7 @@ config HWLAT_TRACER
 	select GENERIC_TRACER
 	help
 	 This tracer, when enabled will create one or more kernel threads,
-	 depening on what the cpumask file is set to, which each thread
+	 depending on what the cpumask file is set to, which each thread
 	 spinning in a loop looking for interruptions caused by
 	 something other than the kernel. For example, if a
 	 System Management Interrupt (SMI) takes a noticeable amount of
@@ -239,7 +250,7 @@ config HWLAT_TRACER
 				     iteration
 
 	 A kernel thread is created that will spin with interrupts disabled
-	 for "width" microseconds in every "widow" cycle. It will not spin
+	 for "width" microseconds in every "window" cycle. It will not spin
 	 for "window - width" microseconds, where the system can
 	 continue to operate.
 
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 19a15b2f1190..e2538c7638d4 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_TRACING_MAP) += tracing_map.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 45a3928544ce..987d9a9ae283 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -66,7 +66,8 @@ static struct tracer_flags blk_tracer_flags = {
 };
 
 /* Global reference count of probes */
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+static DEFINE_MUTEX(blk_probe_mutex);
+static int blk_probes_ref;
 
 static void blk_register_tracepoints(void);
 static void blk_unregister_tracepoints(void);
@@ -329,14 +330,29 @@ static void blk_trace_free(struct blk_trace *bt)
 	kfree(bt);
 }
 
+static void get_probe_ref(void)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (++blk_probes_ref == 1)
+		blk_register_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+static void put_probe_ref(void)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (!--blk_probes_ref)
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
 static void blk_trace_cleanup(struct blk_trace *bt)
 {
 	blk_trace_free(bt);
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
+	put_probe_ref();
 }
 
-int blk_trace_remove(struct request_queue *q)
+static int __blk_trace_remove(struct request_queue *q)
 {
 	struct blk_trace *bt;
 
@@ -349,6 +365,17 @@ int blk_trace_remove(struct request_queue *q)
 
 	return 0;
 }
+
+int blk_trace_remove(struct request_queue *q)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_remove(q);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_remove);
 
 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -538,8 +565,7 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto err;
 
-	if (atomic_inc_return(&blk_probes_ref) == 1)
-		blk_register_tracepoints();
+	get_probe_ref();
 
 	ret = 0;
 err:
@@ -550,9 +576,8 @@ err:
 	return ret;
 }
 
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-		    struct block_device *bdev,
-		    char __user *arg)
+static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			     struct block_device *bdev, char __user *arg)
 {
 	struct blk_user_trace_setup buts;
 	int ret;
@@ -566,11 +591,24 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		return ret;
 
 	if (copy_to_user(arg, &buts, sizeof(buts))) {
-		blk_trace_remove(q);
+		__blk_trace_remove(q);
 		return -EFAULT;
 	}
 	return 0;
 }
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
+		    char __user *arg)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_setup(q, name, dev, bdev, arg);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_setup);
 
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -599,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 		return ret;
 
 	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
-		blk_trace_remove(q);
+		__blk_trace_remove(q);
 		return -EFAULT;
 	}
 
@@ -607,7 +645,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 }
 #endif
 
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
 {
 	int ret;
 	struct blk_trace *bt = q->blk_trace;
@@ -646,6 +684,17 @@ int blk_trace_startstop(struct request_queue *q, int start)
 
 	return ret;
 }
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+	int ret;
+
+	mutex_lock(&q->blk_trace_mutex);
+	ret = __blk_trace_startstop(q, start);
+	mutex_unlock(&q->blk_trace_mutex);
+
+	return ret;
+}
 EXPORT_SYMBOL_GPL(blk_trace_startstop);
 
 /*
@@ -676,7 +725,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+		ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
 	case BLKTRACESETUP32:
@@ -687,10 +736,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	case BLKTRACESTART:
 		start = 1;
 	case BLKTRACESTOP:
-		ret = blk_trace_startstop(q, start);
+		ret = __blk_trace_startstop(q, start);
 		break;
 	case BLKTRACETEARDOWN:
-		ret = blk_trace_remove(q);
+		ret = __blk_trace_remove(q);
 		break;
 	default:
 		ret = -ENOTTY;
@@ -708,10 +757,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
  **/
 void blk_trace_shutdown(struct request_queue *q)
 {
+	mutex_lock(&q->blk_trace_mutex);
+
 	if (q->blk_trace) {
-		blk_trace_startstop(q, 0);
-		blk_trace_remove(q);
+		__blk_trace_startstop(q, 0);
+		__blk_trace_remove(q);
 	}
+
+	mutex_unlock(&q->blk_trace_mutex);
 }
 
 #ifdef CONFIG_BLK_CGROUP
@@ -819,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
  *
  **/
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-			      u32 what, int error, union kernfs_node_id *cgid)
+			      u32 what, int error)
 {
 	struct blk_trace *bt = q->blk_trace;
 
@@ -827,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
+			bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+			blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
 				     struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
 				       struct request_queue *q, struct bio *bio,
 				       int error)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
@@ -850,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request *rq,
 					struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
-			 blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
 }
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -859,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request *rq,
 					 struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
 }
 
 static void blk_add_trace_bio_queue(void *ignore,
 				    struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
 }
 
 static void blk_add_trace_getrq(void *ignore,
@@ -875,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore,
 				struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
-				  blk_trace_bio_get_cgid(q, bio));
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
@@ -892,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore,
 				  struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
-				  blk_trace_bio_get_cgid(q, bio));
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
@@ -1558,9 +1605,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (bt == NULL)
 		return -EINVAL;
 
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
-
+	put_probe_ref();
 	blk_trace_free(bt);
 	return 0;
 }
@@ -1591,8 +1636,7 @@ static int blk_trace_setup_queue(struct request_queue *q,
 	if (cmpxchg(&q->blk_trace, NULL, bt))
 		goto free_bt;
 
-	if (atomic_inc_return(&blk_probes_ref) == 1)
-		blk_register_tracepoints();
+	get_probe_ref();
 	return 0;
 
 free_bt:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 95888ae6c263..0ce99c379c30 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -15,9 +15,11 @@
 #include <linux/ctype.h>
 #include "trace.h"
 
+u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
+
 /**
  * trace_call_bpf - invoke BPF program
- * @prog: BPF program
+ * @call: tracepoint event
  * @ctx: opaque context pointer
  *
  * kprobe handlers execute BPF programs via this helper.
@@ -29,7 +31,7 @@
  * 1 - store kprobe event into ring buffer
  * Other values are reserved and currently alias to 1
  */
-unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
+unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
 {
 	unsigned int ret;
 
@@ -49,9 +51,22 @@ unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx)
 		goto out;
 	}
 
-	rcu_read_lock();
-	ret = BPF_PROG_RUN(prog, ctx);
-	rcu_read_unlock();
+	/*
+	 * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock
+	 * to all call sites, we did a bpf_prog_array_valid() there to check
+	 * whether call->prog_array is empty or not, which is
+	 * a heurisitc to speed up execution.
+	 *
+	 * If bpf_prog_array_valid() fetched prog_array was
+	 * non-NULL, we go into trace_call_bpf() and do the actual
+	 * proper rcu_dereference() under RCU lock.
+	 * If it turns out that prog_array is NULL then, we bail out.
+	 * For the opposite, if the bpf_prog_array_valid() fetched pointer
+	 * was NULL, you'll skip the prog_array with the risk of missing
+	 * out of events when it was updated in between this and the
+	 * rcu_dereference() which is accepted risk.
+	 */
+	ret = BPF_PROG_RUN_ARRAY_CHECK(call->prog_array, ctx, BPF_PROG_RUN);
 
  out:
 	__this_cpu_dec(bpf_prog_active);
@@ -77,7 +92,7 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
-	.arg2_type	= ARG_CONST_SIZE,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg3_type	= ARG_ANYTHING,
 };
 
@@ -255,14 +270,14 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
-BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+static __always_inline int
+get_map_perf_counter(struct bpf_map *map, u64 flags,
+		     u64 *value, u64 *enabled, u64 *running)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
-	u64 value = 0;
-	int err;
 
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
@@ -275,7 +290,15 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
 	if (!ee)
 		return -ENOENT;
 
-	err = perf_event_read_local(ee->event, &value, NULL, NULL);
+	return perf_event_read_local(ee->event, value, enabled, running);
+}
+
+BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
+{
+	u64 value = 0;
+	int err;
+
+	err = get_map_perf_counter(map, flags, &value, NULL, NULL);
 	/*
 	 * this api is ugly since we miss [-22..-2] range of valid
 	 * counter values, but that's uapi
@@ -293,6 +316,33 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_4(bpf_perf_event_read_value, struct bpf_map *, map, u64, flags,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = get_map_perf_counter(map, flags, &buf->counter, &buf->enabled,
+				   &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
+	.func		= bpf_perf_event_read_value,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_CONST_MAP_PTR,
+	.arg2_type	= ARG_ANYTHING,
+	.arg3_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg4_type	= ARG_CONST_SIZE,
+};
+
 static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
 
 static __always_inline u64
@@ -353,7 +403,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -444,7 +494,7 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
-	.arg2_type	= ARG_CONST_SIZE,
+	.arg2_type	= ARG_CONST_SIZE_OR_ZERO,
 	.arg3_type	= ARG_ANYTHING,
 };
 
@@ -499,6 +549,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_perf_event_output_proto;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto;
+	case BPF_FUNC_perf_event_read_value:
+		return &bpf_perf_event_read_value_proto;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -524,11 +576,14 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
 	return true;
 }
 
-const struct bpf_verifier_ops kprobe_prog_ops = {
+const struct bpf_verifier_ops kprobe_verifier_ops = {
 	.get_func_proto  = kprobe_prog_func_proto,
 	.is_valid_access = kprobe_prog_is_valid_access,
 };
 
+const struct bpf_prog_ops kprobe_prog_ops = {
+};
+
 BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
@@ -550,7 +605,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
 	.arg4_type	= ARG_PTR_TO_MEM,
-	.arg5_type	= ARG_CONST_SIZE,
+	.arg5_type	= ARG_CONST_SIZE_OR_ZERO,
 };
 
 BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
@@ -576,6 +631,32 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = {
 	.arg3_type	= ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_perf_prog_read_value_tp, struct bpf_perf_event_data_kern *, ctx,
+	   struct bpf_perf_event_value *, buf, u32, size)
+{
+	int err = -EINVAL;
+
+	if (unlikely(size != sizeof(struct bpf_perf_event_value)))
+		goto clear;
+	err = perf_event_read_local(ctx->event, &buf->counter, &buf->enabled,
+				    &buf->running);
+	if (unlikely(err))
+		goto clear;
+	return 0;
+clear:
+	memset(buf, 0, size);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_value_proto_tp = {
+         .func           = bpf_perf_prog_read_value_tp,
+         .gpl_only       = true,
+         .ret_type       = RET_INTEGER,
+         .arg1_type      = ARG_PTR_TO_CTX,
+         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+         .arg3_type      = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -583,6 +664,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
 		return &bpf_perf_event_output_proto_tp;
 	case BPF_FUNC_get_stackid:
 		return &bpf_get_stackid_proto_tp;
+	case BPF_FUNC_perf_prog_read_value:
+		return &bpf_perf_prog_read_value_proto_tp;
 	default:
 		return tracing_func_proto(func_id);
 	}
@@ -602,11 +685,14 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	return true;
 }
 
-const struct bpf_verifier_ops tracepoint_prog_ops = {
+const struct bpf_verifier_ops tracepoint_verifier_ops = {
 	.get_func_proto  = tp_prog_func_proto,
 	.is_valid_access = tp_prog_is_valid_access,
 };
 
+const struct bpf_prog_ops tracepoint_prog_ops = {
+};
+
 static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
 				    struct bpf_insn_access_aux *info)
 {
@@ -662,8 +748,75 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
 	return insn - insn_buf;
 }
 
-const struct bpf_verifier_ops perf_event_prog_ops = {
+const struct bpf_verifier_ops perf_event_verifier_ops = {
 	.get_func_proto		= tp_prog_func_proto,
 	.is_valid_access	= pe_prog_is_valid_access,
 	.convert_ctx_access	= pe_prog_convert_ctx_access,
 };
+
+const struct bpf_prog_ops perf_event_prog_ops = {
+};
+
+static DEFINE_MUTEX(bpf_event_mutex);
+
+#define BPF_TRACE_MAX_PROGS 64
+
+int perf_event_attach_bpf_prog(struct perf_event *event,
+			       struct bpf_prog *prog)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	int ret = -EEXIST;
+
+	mutex_lock(&bpf_event_mutex);
+
+	if (event->prog)
+		goto unlock;
+
+	old_array = event->tp_event->prog_array;
+	if (old_array &&
+	    bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
+		ret = -E2BIG;
+		goto unlock;
+	}
+
+	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
+	if (ret < 0)
+		goto unlock;
+
+	/* set the new array to event->tp_event and set event->prog */
+	event->prog = prog;
+	rcu_assign_pointer(event->tp_event->prog_array, new_array);
+	bpf_prog_array_free(old_array);
+
+unlock:
+	mutex_unlock(&bpf_event_mutex);
+	return ret;
+}
+
+void perf_event_detach_bpf_prog(struct perf_event *event)
+{
+	struct bpf_prog_array __rcu *old_array;
+	struct bpf_prog_array *new_array;
+	int ret;
+
+	mutex_lock(&bpf_event_mutex);
+
+	if (!event->prog)
+		goto unlock;
+
+	old_array = event->tp_event->prog_array;
+	ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array);
+	if (ret < 0) {
+		bpf_prog_array_delete_safe(old_array, event->prog);
+	} else {
+		rcu_assign_pointer(event->tp_event->prog_array, new_array);
+		bpf_prog_array_free(old_array);
+	}
+
+	bpf_prog_put(event->prog);
+	event->prog = NULL;
+
+unlock:
+	mutex_unlock(&bpf_event_mutex);
+}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8319e09e15b9..ccdf3664e4a9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -203,30 +203,6 @@ void clear_ftrace_function(void)
 	ftrace_trace_function = ftrace_stub;
 }
 
-static void per_cpu_ops_disable_all(struct ftrace_ops *ops)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		*per_cpu_ptr(ops->disabled, cpu) = 1;
-}
-
-static int per_cpu_ops_alloc(struct ftrace_ops *ops)
-{
-	int __percpu *disabled;
-
-	if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU)))
-		return -EINVAL;
-
-	disabled = alloc_percpu(int);
-	if (!disabled)
-		return -ENOMEM;
-
-	ops->disabled = disabled;
-	per_cpu_ops_disable_all(ops);
-	return 0;
-}
-
 static void ftrace_sync(struct work_struct *work)
 {
 	/*
@@ -262,8 +238,8 @@ static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops)
 	 * If this is a dynamic, RCU, or per CPU ops, or we force list func,
 	 * then it needs to call the list anyway.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU |
-			  FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC)
+	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_RCU) ||
+	    FTRACE_FORCE_LIST_FUNC)
 		return ftrace_ops_list_func;
 
 	return ftrace_ops_get_func(ops);
@@ -422,11 +398,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	if (!core_kernel_data((unsigned long)ops))
 		ops->flags |= FTRACE_OPS_FL_DYNAMIC;
 
-	if (ops->flags & FTRACE_OPS_FL_PER_CPU) {
-		if (per_cpu_ops_alloc(ops))
-			return -ENOMEM;
-	}
-
 	add_ftrace_ops(&ftrace_ops_list, ops);
 
 	/* Always save the function, and reset at unregistering */
@@ -2727,11 +2698,6 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops)
 {
 }
 
-static void per_cpu_ops_free(struct ftrace_ops *ops)
-{
-	free_percpu(ops->disabled);
-}
-
 static void ftrace_startup_enable(int command)
 {
 	if (saved_ftrace_func != ftrace_trace_function) {
@@ -2833,7 +2799,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 		 * not currently active, we can just free them
 		 * without synchronizing all CPUs.
 		 */
-		if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+		if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
 			goto free_ops;
 
 		return 0;
@@ -2880,7 +2846,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 	 * The same goes for freeing the per_cpu data of the per_cpu
 	 * ops.
 	 */
-	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+	if (ops->flags & FTRACE_OPS_FL_DYNAMIC) {
 		/*
 		 * We need to do a hard force of sched synchronization.
 		 * This is because we use preempt_disable() to do RCU, but
@@ -2903,9 +2869,6 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 
  free_ops:
 		arch_ftrace_trampoline_free(ops);
-
-		if (ops->flags & FTRACE_OPS_FL_PER_CPU)
-			per_cpu_ops_free(ops);
 	}
 
 	return 0;
@@ -5672,10 +5635,29 @@ static int ftrace_process_locs(struct module *mod,
 	return ret;
 }
 
+struct ftrace_mod_func {
+	struct list_head	list;
+	char			*name;
+	unsigned long		ip;
+	unsigned int		size;
+};
+
+struct ftrace_mod_map {
+	struct rcu_head		rcu;
+	struct list_head	list;
+	struct module		*mod;
+	unsigned long		start_addr;
+	unsigned long		end_addr;
+	struct list_head	funcs;
+	unsigned int		num_funcs;
+};
+
 #ifdef CONFIG_MODULES
 
 #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next)
 
+static LIST_HEAD(ftrace_mod_maps);
+
 static int referenced_filters(struct dyn_ftrace *rec)
 {
 	struct ftrace_ops *ops;
@@ -5729,8 +5711,26 @@ static void clear_mod_from_hashes(struct ftrace_page *pg)
 	mutex_unlock(&trace_types_lock);
 }
 
+static void ftrace_free_mod_map(struct rcu_head *rcu)
+{
+	struct ftrace_mod_map *mod_map = container_of(rcu, struct ftrace_mod_map, rcu);
+	struct ftrace_mod_func *mod_func;
+	struct ftrace_mod_func *n;
+
+	/* All the contents of mod_map are now not visible to readers */
+	list_for_each_entry_safe(mod_func, n, &mod_map->funcs, list) {
+		kfree(mod_func->name);
+		list_del(&mod_func->list);
+		kfree(mod_func);
+	}
+
+	kfree(mod_map);
+}
+
 void ftrace_release_mod(struct module *mod)
 {
+	struct ftrace_mod_map *mod_map;
+	struct ftrace_mod_map *n;
 	struct dyn_ftrace *rec;
 	struct ftrace_page **last_pg;
 	struct ftrace_page *tmp_page = NULL;
@@ -5742,6 +5742,14 @@ void ftrace_release_mod(struct module *mod)
 	if (ftrace_disabled)
 		goto out_unlock;
 
+	list_for_each_entry_safe(mod_map, n, &ftrace_mod_maps, list) {
+		if (mod_map->mod == mod) {
+			list_del_rcu(&mod_map->list);
+			call_rcu_sched(&mod_map->rcu, ftrace_free_mod_map);
+			break;
+		}
+	}
+
 	/*
 	 * Each module has its own ftrace_pages, remove
 	 * them from the list.
@@ -5749,7 +5757,8 @@ void ftrace_release_mod(struct module *mod)
 	last_pg = &ftrace_pages_start;
 	for (pg = ftrace_pages_start; pg; pg = *last_pg) {
 		rec = &pg->records[0];
-		if (within_module_core(rec->ip, mod)) {
+		if (within_module_core(rec->ip, mod) ||
+		    within_module_init(rec->ip, mod)) {
 			/*
 			 * As core pages are first, the first
 			 * page should never be a module page.
@@ -5818,7 +5827,8 @@ void ftrace_module_enable(struct module *mod)
 		 * not part of this module, then skip this pg,
 		 * which the "break" will do.
 		 */
-		if (!within_module_core(rec->ip, mod))
+		if (!within_module_core(rec->ip, mod) &&
+		    !within_module_init(rec->ip, mod))
 			break;
 
 		cnt = 0;
@@ -5863,23 +5873,245 @@ void ftrace_module_init(struct module *mod)
 	ftrace_process_locs(mod, mod->ftrace_callsites,
 			    mod->ftrace_callsites + mod->num_ftrace_callsites);
 }
+
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+				struct dyn_ftrace *rec)
+{
+	struct ftrace_mod_func *mod_func;
+	unsigned long symsize;
+	unsigned long offset;
+	char str[KSYM_SYMBOL_LEN];
+	char *modname;
+	const char *ret;
+
+	ret = kallsyms_lookup(rec->ip, &symsize, &offset, &modname, str);
+	if (!ret)
+		return;
+
+	mod_func = kmalloc(sizeof(*mod_func), GFP_KERNEL);
+	if (!mod_func)
+		return;
+
+	mod_func->name = kstrdup(str, GFP_KERNEL);
+	if (!mod_func->name) {
+		kfree(mod_func);
+		return;
+	}
+
+	mod_func->ip = rec->ip - offset;
+	mod_func->size = symsize;
+
+	mod_map->num_funcs++;
+
+	list_add_rcu(&mod_func->list, &mod_map->funcs);
+}
+
+static struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+			unsigned long start, unsigned long end)
+{
+	struct ftrace_mod_map *mod_map;
+
+	mod_map = kmalloc(sizeof(*mod_map), GFP_KERNEL);
+	if (!mod_map)
+		return NULL;
+
+	mod_map->mod = mod;
+	mod_map->start_addr = start;
+	mod_map->end_addr = end;
+	mod_map->num_funcs = 0;
+
+	INIT_LIST_HEAD_RCU(&mod_map->funcs);
+
+	list_add_rcu(&mod_map->list, &ftrace_mod_maps);
+
+	return mod_map;
+}
+
+static const char *
+ftrace_func_address_lookup(struct ftrace_mod_map *mod_map,
+			   unsigned long addr, unsigned long *size,
+			   unsigned long *off, char *sym)
+{
+	struct ftrace_mod_func *found_func =  NULL;
+	struct ftrace_mod_func *mod_func;
+
+	list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+		if (addr >= mod_func->ip &&
+		    addr < mod_func->ip + mod_func->size) {
+			found_func = mod_func;
+			break;
+		}
+	}
+
+	if (found_func) {
+		if (size)
+			*size = found_func->size;
+		if (off)
+			*off = addr - found_func->ip;
+		if (sym)
+			strlcpy(sym, found_func->name, KSYM_NAME_LEN);
+
+		return found_func->name;
+	}
+
+	return NULL;
+}
+
+const char *
+ftrace_mod_address_lookup(unsigned long addr, unsigned long *size,
+		   unsigned long *off, char **modname, char *sym)
+{
+	struct ftrace_mod_map *mod_map;
+	const char *ret = NULL;
+
+	/* mod_map is freed via call_rcu_sched() */
+	preempt_disable();
+	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+		ret = ftrace_func_address_lookup(mod_map, addr, size, off, sym);
+		if (ret) {
+			if (modname)
+				*modname = mod_map->mod->name;
+			break;
+		}
+	}
+	preempt_enable();
+
+	return ret;
+}
+
+int ftrace_mod_get_kallsym(unsigned int symnum, unsigned long *value,
+			   char *type, char *name,
+			   char *module_name, int *exported)
+{
+	struct ftrace_mod_map *mod_map;
+	struct ftrace_mod_func *mod_func;
+
+	preempt_disable();
+	list_for_each_entry_rcu(mod_map, &ftrace_mod_maps, list) {
+
+		if (symnum >= mod_map->num_funcs) {
+			symnum -= mod_map->num_funcs;
+			continue;
+		}
+
+		list_for_each_entry_rcu(mod_func, &mod_map->funcs, list) {
+			if (symnum > 1) {
+				symnum--;
+				continue;
+			}
+
+			*value = mod_func->ip;
+			*type = 'T';
+			strlcpy(name, mod_func->name, KSYM_NAME_LEN);
+			strlcpy(module_name, mod_map->mod->name, MODULE_NAME_LEN);
+			*exported = 1;
+			preempt_enable();
+			return 0;
+		}
+		WARN_ON(1);
+		break;
+	}
+	preempt_enable();
+	return -ERANGE;
+}
+
+#else
+static void save_ftrace_mod_rec(struct ftrace_mod_map *mod_map,
+				struct dyn_ftrace *rec) { }
+static inline struct ftrace_mod_map *
+allocate_ftrace_mod_map(struct module *mod,
+			unsigned long start, unsigned long end)
+{
+	return NULL;
+}
 #endif /* CONFIG_MODULES */
 
-void __init ftrace_free_init_mem(void)
+struct ftrace_init_func {
+	struct list_head list;
+	unsigned long ip;
+};
+
+/* Clear any init ips from hashes */
+static void
+clear_func_from_hash(struct ftrace_init_func *func, struct ftrace_hash *hash)
+{
+	struct ftrace_func_entry *entry;
+
+	if (ftrace_hash_empty(hash))
+		return;
+
+	entry = __ftrace_lookup_ip(hash, func->ip);
+
+	/*
+	 * Do not allow this rec to match again.
+	 * Yeah, it may waste some memory, but will be removed
+	 * if/when the hash is modified again.
+	 */
+	if (entry)
+		entry->ip = 0;
+}
+
+static void
+clear_func_from_hashes(struct ftrace_init_func *func)
+{
+	struct trace_array *tr;
+
+	mutex_lock(&trace_types_lock);
+	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+		if (!tr->ops || !tr->ops->func_hash)
+			continue;
+		mutex_lock(&tr->ops->func_hash->regex_lock);
+		clear_func_from_hash(func, tr->ops->func_hash->filter_hash);
+		clear_func_from_hash(func, tr->ops->func_hash->notrace_hash);
+		mutex_unlock(&tr->ops->func_hash->regex_lock);
+	}
+	mutex_unlock(&trace_types_lock);
+}
+
+static void add_to_clear_hash_list(struct list_head *clear_list,
+				   struct dyn_ftrace *rec)
+{
+	struct ftrace_init_func *func;
+
+	func = kmalloc(sizeof(*func), GFP_KERNEL);
+	if (!func) {
+		WARN_ONCE(1, "alloc failure, ftrace filter could be stale\n");
+		return;
+	}
+
+	func->ip = rec->ip;
+	list_add(&func->list, clear_list);
+}
+
+void ftrace_free_mem(struct module *mod, void *start_ptr, void *end_ptr)
 {
-	unsigned long start = (unsigned long)(&__init_begin);
-	unsigned long end = (unsigned long)(&__init_end);
+	unsigned long start = (unsigned long)(start_ptr);
+	unsigned long end = (unsigned long)(end_ptr);
 	struct ftrace_page **last_pg = &ftrace_pages_start;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	struct dyn_ftrace key;
+	struct ftrace_mod_map *mod_map = NULL;
+	struct ftrace_init_func *func, *func_next;
+	struct list_head clear_hash;
 	int order;
 
+	INIT_LIST_HEAD(&clear_hash);
+
 	key.ip = start;
 	key.flags = end;	/* overload flags, as it is unsigned long */
 
 	mutex_lock(&ftrace_lock);
 
+	/*
+	 * If we are freeing module init memory, then check if
+	 * any tracer is active. If so, we need to save a mapping of
+	 * the module functions being freed with the address.
+	 */
+	if (mod && ftrace_ops_list != &ftrace_list_end)
+		mod_map = allocate_ftrace_mod_map(mod, start, end);
+
 	for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) {
 		if (end < pg->records[0].ip ||
 		    start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE))
@@ -5890,6 +6122,13 @@ void __init ftrace_free_init_mem(void)
 			      ftrace_cmp_recs);
 		if (!rec)
 			continue;
+
+		/* rec will be cleared from hashes after ftrace_lock unlock */
+		add_to_clear_hash_list(&clear_hash, rec);
+
+		if (mod_map)
+			save_ftrace_mod_rec(mod_map, rec);
+
 		pg->index--;
 		ftrace_update_tot_cnt--;
 		if (!pg->index) {
@@ -5908,6 +6147,19 @@ void __init ftrace_free_init_mem(void)
 		goto again;
 	}
 	mutex_unlock(&ftrace_lock);
+
+	list_for_each_entry_safe(func, func_next, &clear_hash, list) {
+		clear_func_from_hashes(func);
+		kfree(func);
+	}
+}
+
+void __init ftrace_free_init_mem(void)
+{
+	void *start = (void *)(&__init_begin);
+	void *end = (void *)(&__init_end);
+
+	ftrace_free_mem(NULL, start, end);
 }
 
 void __init ftrace_init(void)
@@ -6063,10 +6315,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
 		 * If any of the above fails then the op->func() is not executed.
 		 */
 		if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) &&
-		    (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-		     !ftrace_function_local_disabled(op)) &&
 		    ftrace_ops_test(op, ip, regs)) {
-		    
 			if (FTRACE_WARN_ON(!op->func)) {
 				pr_warn("op=%p %pS\n", op, op);
 				goto out;
@@ -6124,10 +6373,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
 
 	preempt_disable_notrace();
 
-	if (!(op->flags & FTRACE_OPS_FL_PER_CPU) ||
-	    !ftrace_function_local_disabled(op)) {
-		op->func(ip, parent_ip, op, regs);
-	}
+	op->func(ip, parent_ip, op, regs);
 
 	preempt_enable_notrace();
 	trace_clear_recursion(bit);
@@ -6151,7 +6397,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
 	 * or does per cpu logic, then we need to call the assist handler.
 	 */
 	if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) ||
-	    ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU))
+	    ops->flags & FTRACE_OPS_FL_RCU)
 		return ftrace_ops_assist_func;
 
 	return ops->func;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 845f3805c73d..91874a95060d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -13,7 +13,6 @@
 #include <linux/uaccess.h>
 #include <linux/hardirq.h>
 #include <linux/kthread.h>	/* for self test */
-#include <linux/kmemcheck.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
 	}
 
 	event = __rb_page_index(tail_page, tail);
-	kmemcheck_annotate_bitfield(event, bitfield);
 
 	/* account for padding bytes */
 	local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
@@ -2538,61 +2536,29 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * The lock and unlock are done within a preempt disable section.
  * The current_context per_cpu variable can only be modified
  * by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. To pass this
- * information from the lock to the unlock without having to
- * access the 'in_interrupt()' functions again (which do show
- * a bit of overhead in something as critical as function tracing,
- * we use a bitmask trick.
+ * be modified more than once via an interrupt. There are four
+ * different contexts that we need to consider.
  *
- *  bit 0 =  NMI context
- *  bit 1 =  IRQ context
- *  bit 2 =  SoftIRQ context
- *  bit 3 =  normal context.
+ *  Normal context.
+ *  SoftIRQ context
+ *  IRQ context
+ *  NMI context
  *
- * This works because this is the order of contexts that can
- * preempt other contexts. A SoftIRQ never preempts an IRQ
- * context.
- *
- * When the context is determined, the corresponding bit is
- * checked and set (if it was set, then a recursion of that context
- * happened).
- *
- * On unlock, we need to clear this bit. To do so, just subtract
- * 1 from the current_context and AND it to itself.
- *
- * (binary)
- *  101 - 1 = 100
- *  101 & 100 = 100 (clearing bit zero)
- *
- *  1010 - 1 = 1001
- *  1010 & 1001 = 1000 (clearing bit 1)
- *
- * The least significant bit can be cleared this way, and it
- * just so happens that it is the same bit corresponding to
- * the current context.
+ * If for some reason the ring buffer starts to recurse, we
+ * only allow that to happen at most 4 times (one for each
+ * context). If it happens 5 times, then we consider this a
+ * recusive loop and do not let it go further.
  */
 
 static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	unsigned int val = cpu_buffer->current_context;
-	int bit;
-
-	if (in_interrupt()) {
-		if (in_nmi())
-			bit = RB_CTX_NMI;
-		else if (in_irq())
-			bit = RB_CTX_IRQ;
-		else
-			bit = RB_CTX_SOFTIRQ;
-	} else
-		bit = RB_CTX_NORMAL;
-
-	if (unlikely(val & (1 << bit)))
+	if (cpu_buffer->current_context >= 4)
 		return 1;
 
-	val |= (1 << bit);
-	cpu_buffer->current_context = val;
+	cpu_buffer->current_context++;
+	/* Interrupts must see this update */
+	barrier();
 
 	return 0;
 }
@@ -2600,7 +2566,9 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 static __always_inline void
 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
+	/* Don't let the dec leak out */
+	barrier();
+	cpu_buffer->current_context--;
 }
 
 /**
@@ -2686,7 +2654,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	/* We reserved something on the buffer */
 
 	event = __rb_page_index(tail_page, tail);
-	kmemcheck_annotate_bitfield(event, bitfield);
 	rb_update_event(cpu_buffer, event, info);
 
 	local_inc(&tail_page->entries);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 752e5daf0896..73e67b68c53b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7687,6 +7687,7 @@ static int instance_mkdir(const char *name)
 	struct trace_array *tr;
 	int ret;
 
+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
 
 	ret = -EEXIST;
@@ -7742,6 +7743,7 @@ static int instance_mkdir(const char *name)
 	list_add(&tr->list, &ftrace_trace_arrays);
 
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return 0;
 
@@ -7753,6 +7755,7 @@ static int instance_mkdir(const char *name)
 
  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 
@@ -7765,6 +7768,7 @@ static int instance_rmdir(const char *name)
 	int ret;
 	int i;
 
+	mutex_lock(&event_mutex);
 	mutex_lock(&trace_types_lock);
 
 	ret = -ENODEV;
@@ -7810,6 +7814,7 @@ static int instance_rmdir(const char *name)
 
  out_unlock:
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
@@ -8276,6 +8281,92 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
 }
 EXPORT_SYMBOL_GPL(ftrace_dump);
 
+int trace_run_command(const char *buf, int (*createfn)(int, char **))
+{
+	char **argv;
+	int argc, ret;
+
+	argc = 0;
+	ret = 0;
+	argv = argv_split(GFP_KERNEL, buf, &argc);
+	if (!argv)
+		return -ENOMEM;
+
+	if (argc)
+		ret = createfn(argc, argv);
+
+	argv_free(argv);
+
+	return ret;
+}
+
+#define WRITE_BUFSIZE  4096
+
+ssize_t trace_parse_run_command(struct file *file, const char __user *buffer,
+				size_t count, loff_t *ppos,
+				int (*createfn)(int, char **))
+{
+	char *kbuf, *buf, *tmp;
+	int ret = 0;
+	size_t done = 0;
+	size_t size;
+
+	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	while (done < count) {
+		size = count - done;
+
+		if (size >= WRITE_BUFSIZE)
+			size = WRITE_BUFSIZE - 1;
+
+		if (copy_from_user(kbuf, buffer + done, size)) {
+			ret = -EFAULT;
+			goto out;
+		}
+		kbuf[size] = '\0';
+		buf = kbuf;
+		do {
+			tmp = strchr(buf, '\n');
+			if (tmp) {
+				*tmp = '\0';
+				size = tmp - buf + 1;
+			} else {
+				size = strlen(buf);
+				if (done + size < count) {
+					if (buf != kbuf)
+						break;
+					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
+					pr_warn("Line length is too long: Should be less than %d\n",
+						WRITE_BUFSIZE - 2);
+					ret = -EINVAL;
+					goto out;
+				}
+			}
+			done += size;
+
+			/* Remove comments */
+			tmp = strchr(buf, '#');
+
+			if (tmp)
+				*tmp = '\0';
+
+			ret = trace_run_command(buf, createfn);
+			if (ret)
+				goto out;
+			buf += size;
+
+		} while (done < count);
+	}
+	ret = done;
+
+out:
+	kfree(kbuf);
+
+	return ret;
+}
+
 __init static int tracer_alloc_buffers(void)
 {
 	int ring_buf_size;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6b0b343a36a2..2a6d0325a761 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -739,8 +739,6 @@ extern int trace_selftest_startup_wakeup(struct tracer *trace,
 					 struct trace_array *tr);
 extern int trace_selftest_startup_nop(struct tracer *trace,
 					 struct trace_array *tr);
-extern int trace_selftest_startup_sched_switch(struct tracer *trace,
-					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
 /*
@@ -1755,6 +1753,13 @@ void trace_printk_start_comm(void);
 int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
 int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
 
+#define MAX_EVENT_NAME_LEN	64
+
+extern int trace_run_command(const char *buf, int (*createfn)(int, char**));
+extern ssize_t trace_parse_run_command(struct file *file,
+		const char __user *buffer, size_t count, loff_t *ppos,
+		int (*createfn)(int, char**));
+
 /*
  * Normal trace_printk() and friends allocates special buffers
  * to do the manipulation, as well as saves the print formats
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 13ba2d3f6a91..55d6dff37daf 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -240,27 +240,41 @@ void perf_trace_destroy(struct perf_event *p_event)
 int perf_trace_add(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	struct hlist_head __percpu *pcpu_list;
-	struct hlist_head *list;
-
-	pcpu_list = tp_event->perf_events;
-	if (WARN_ON_ONCE(!pcpu_list))
-		return -EINVAL;
 
 	if (!(flags & PERF_EF_START))
 		p_event->hw.state = PERF_HES_STOPPED;
 
-	list = this_cpu_ptr(pcpu_list);
-	hlist_add_head_rcu(&p_event->hlist_entry, list);
+	/*
+	 * If TRACE_REG_PERF_ADD returns false; no custom action was performed
+	 * and we need to take the default action of enqueueing our event on
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
+		struct hlist_head __percpu *pcpu_list;
+		struct hlist_head *list;
+
+		pcpu_list = tp_event->perf_events;
+		if (WARN_ON_ONCE(!pcpu_list))
+			return -EINVAL;
+
+		list = this_cpu_ptr(pcpu_list);
+		hlist_add_head_rcu(&p_event->hlist_entry, list);
+	}
 
-	return tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event);
+	return 0;
 }
 
 void perf_trace_del(struct perf_event *p_event, int flags)
 {
 	struct trace_event_call *tp_event = p_event->tp_event;
-	hlist_del_rcu(&p_event->hlist_entry);
-	tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
+
+	/*
+	 * If TRACE_REG_PERF_DEL returns false; no custom action was performed
+	 * and we need to take the default action of dequeueing our event from
+	 * the right per-cpu hlist.
+	 */
+	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
+		hlist_del_rcu(&p_event->hlist_entry);
 }
 
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
@@ -306,16 +320,25 @@ static void
 perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *ops, struct pt_regs *pt_regs)
 {
-	struct perf_event *event;
 	struct ftrace_entry *entry;
-	struct hlist_head *head;
+	struct perf_event *event;
+	struct hlist_head head;
 	struct pt_regs regs;
 	int rctx;
 
-	head = this_cpu_ptr(event_function.perf_events);
-	if (hlist_empty(head))
+	if ((unsigned long)ops->private != smp_processor_id())
 		return;
 
+	event = container_of(ops, struct perf_event, ftrace_ops);
+
+	/*
+	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
+	 * the perf code does is hlist_for_each_entry_rcu(), so we can
+	 * get away with simply setting the @head.first pointer in order
+	 * to create a singular list.
+	 */
+	head.first = &event->hlist_entry;
+
 #define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
 		    sizeof(u64)) - sizeof(u32))
 
@@ -330,9 +353,8 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
 
 	entry->ip = ip;
 	entry->parent_ip = parent_ip;
-	event = container_of(ops, struct perf_event, ftrace_ops);
 	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
-			      1, &regs, head, NULL, event);
+			      1, &regs, &head, NULL);
 
 #undef ENTRY_SIZE
 }
@@ -341,8 +363,10 @@ static int perf_ftrace_function_register(struct perf_event *event)
 {
 	struct ftrace_ops *ops = &event->ftrace_ops;
 
-	ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU;
-	ops->func = perf_ftrace_function_call;
+	ops->flags   = FTRACE_OPS_FL_RCU;
+	ops->func    = perf_ftrace_function_call;
+	ops->private = (void *)(unsigned long)nr_cpu_ids;
+
 	return register_ftrace_function(ops);
 }
 
@@ -354,19 +378,11 @@ static int perf_ftrace_function_unregister(struct perf_event *event)
 	return ret;
 }
 
-static void perf_ftrace_function_enable(struct perf_event *event)
-{
-	ftrace_function_local_enable(&event->ftrace_ops);
-}
-
-static void perf_ftrace_function_disable(struct perf_event *event)
-{
-	ftrace_function_local_disable(&event->ftrace_ops);
-}
-
 int perf_ftrace_event_register(struct trace_event_call *call,
 			       enum trace_reg type, void *data)
 {
+	struct perf_event *event = data;
+
 	switch (type) {
 	case TRACE_REG_REGISTER:
 	case TRACE_REG_UNREGISTER:
@@ -379,11 +395,11 @@ int perf_ftrace_event_register(struct trace_event_call *call,
 	case TRACE_REG_PERF_CLOSE:
 		return perf_ftrace_function_unregister(data);
 	case TRACE_REG_PERF_ADD:
-		perf_ftrace_function_enable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
+		return 1;
 	case TRACE_REG_PERF_DEL:
-		perf_ftrace_function_disable(data);
-		return 0;
+		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
+		return 1;
 	}
 
 	return -EINVAL;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87468398b9ed..ec0f9aa4e151 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1406,8 +1406,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 		return -ENODEV;
 
 	/* Make sure the system still exists */
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	list_for_each_entry(tr, &ftrace_trace_arrays, list) {
 		list_for_each_entry(dir, &tr->systems, list) {
 			if (dir == inode->i_private) {
@@ -1421,8 +1421,8 @@ static int subsystem_open(struct inode *inode, struct file *filp)
 		}
 	}
  exit_loop:
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	if (!system)
 		return -ENODEV;
@@ -2294,15 +2294,15 @@ static void __add_event_to_tracers(struct trace_event_call *call);
 int trace_add_event_call(struct trace_event_call *call)
 {
 	int ret;
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 
 	ret = __register_event(call, NULL);
 	if (ret >= 0)
 		__add_event_to_tracers(call);
 
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 	return ret;
 }
 
@@ -2356,13 +2356,13 @@ int trace_remove_event_call(struct trace_event_call *call)
 {
 	int ret;
 
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	down_write(&trace_event_sem);
 	ret = probe_remove_event_call(call);
 	up_write(&trace_event_sem);
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return ret;
 }
@@ -2424,8 +2424,8 @@ static int trace_module_notify(struct notifier_block *self,
 {
 	struct module *mod = data;
 
-	mutex_lock(&trace_types_lock);
 	mutex_lock(&event_mutex);
+	mutex_lock(&trace_types_lock);
 	switch (val) {
 	case MODULE_STATE_COMING:
 		trace_module_add_events(mod);
@@ -2434,8 +2434,8 @@ static int trace_module_notify(struct notifier_block *self,
 		trace_module_remove_events(mod);
 		break;
 	}
-	mutex_unlock(&event_mutex);
 	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&event_mutex);
 
 	return 0;
 }
@@ -2950,24 +2950,24 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
  * creates the event hierachry in the @parent/events directory.
  *
  * Returns 0 on success.
+ *
+ * Must be called with event_mutex held.
  */
 int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
 {
 	int ret;
 
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
 
 	ret = create_event_toplevel_files(parent, tr);
 	if (ret)
-		goto out_unlock;
+		goto out;
 
 	down_write(&trace_event_sem);
 	__trace_add_event_dirs(tr);
 	up_write(&trace_event_sem);
 
- out_unlock:
-	mutex_unlock(&event_mutex);
-
+ out:
 	return ret;
 }
 
@@ -2996,9 +2996,10 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
 	return ret;
 }
 
+/* Must be called with event_mutex held */
 int event_trace_del_tracer(struct trace_array *tr)
 {
-	mutex_lock(&event_mutex);
+	lockdep_assert_held(&event_mutex);
 
 	/* Disable any event triggers and associated soft-disabled events */
 	clear_event_triggers(tr);
@@ -3019,8 +3020,6 @@ int event_trace_del_tracer(struct trace_array *tr)
 
 	tr->event_dir = NULL;
 
-	mutex_unlock(&event_mutex);
-
 	return 0;
 }
 
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 1c21d0e2a145..1e1558c99d56 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -28,12 +28,16 @@ struct hist_field;
 
 typedef u64 (*hist_field_fn_t) (struct hist_field *field, void *event);
 
+#define HIST_FIELD_OPERANDS_MAX	2
+
 struct hist_field {
 	struct ftrace_event_field	*field;
 	unsigned long			flags;
 	hist_field_fn_t			fn;
 	unsigned int			size;
 	unsigned int			offset;
+	unsigned int                    is_signed;
+	struct hist_field		*operands[HIST_FIELD_OPERANDS_MAX];
 };
 
 static u64 hist_field_none(struct hist_field *field, void *event)
@@ -71,7 +75,9 @@ static u64 hist_field_pstring(struct hist_field *hist_field, void *event)
 
 static u64 hist_field_log2(struct hist_field *hist_field, void *event)
 {
-	u64 val = *(u64 *)(event + hist_field->field->offset);
+	struct hist_field *operand = hist_field->operands[0];
+
+	u64 val = operand->fn(operand, event);
 
 	return (u64) ilog2(roundup_pow_of_two(val));
 }
@@ -110,16 +116,16 @@ DEFINE_HIST_FIELD_FN(u8);
 #define HIST_KEY_SIZE_MAX	(MAX_FILTER_STR_VAL + HIST_STACKTRACE_SIZE)
 
 enum hist_field_flags {
-	HIST_FIELD_FL_HITCOUNT		= 1,
-	HIST_FIELD_FL_KEY		= 2,
-	HIST_FIELD_FL_STRING		= 4,
-	HIST_FIELD_FL_HEX		= 8,
-	HIST_FIELD_FL_SYM		= 16,
-	HIST_FIELD_FL_SYM_OFFSET	= 32,
-	HIST_FIELD_FL_EXECNAME		= 64,
-	HIST_FIELD_FL_SYSCALL		= 128,
-	HIST_FIELD_FL_STACKTRACE	= 256,
-	HIST_FIELD_FL_LOG2		= 512,
+	HIST_FIELD_FL_HITCOUNT		= 1 << 0,
+	HIST_FIELD_FL_KEY		= 1 << 1,
+	HIST_FIELD_FL_STRING		= 1 << 2,
+	HIST_FIELD_FL_HEX		= 1 << 3,
+	HIST_FIELD_FL_SYM		= 1 << 4,
+	HIST_FIELD_FL_SYM_OFFSET	= 1 << 5,
+	HIST_FIELD_FL_EXECNAME		= 1 << 6,
+	HIST_FIELD_FL_SYSCALL		= 1 << 7,
+	HIST_FIELD_FL_STACKTRACE	= 1 << 8,
+	HIST_FIELD_FL_LOG2		= 1 << 9,
 };
 
 struct hist_trigger_attrs {
@@ -146,6 +152,25 @@ struct hist_trigger_data {
 	struct tracing_map		*map;
 };
 
+static const char *hist_field_name(struct hist_field *field,
+				   unsigned int level)
+{
+	const char *field_name = "";
+
+	if (level > 1)
+		return field_name;
+
+	if (field->field)
+		field_name = field->field->name;
+	else if (field->flags & HIST_FIELD_FL_LOG2)
+		field_name = hist_field_name(field->operands[0], ++level);
+
+	if (field_name == NULL)
+		field_name = "";
+
+	return field_name;
+}
+
 static hist_field_fn_t select_value_fn(int field_size, int field_is_signed)
 {
 	hist_field_fn_t fn = NULL;
@@ -340,8 +365,20 @@ static const struct tracing_map_ops hist_trigger_elt_comm_ops = {
 	.elt_init	= hist_trigger_elt_comm_init,
 };
 
-static void destroy_hist_field(struct hist_field *hist_field)
+static void destroy_hist_field(struct hist_field *hist_field,
+			       unsigned int level)
 {
+	unsigned int i;
+
+	if (level > 2)
+		return;
+
+	if (!hist_field)
+		return;
+
+	for (i = 0; i < HIST_FIELD_OPERANDS_MAX; i++)
+		destroy_hist_field(hist_field->operands[i], level + 1);
+
 	kfree(hist_field);
 }
 
@@ -368,7 +405,10 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 	}
 
 	if (flags & HIST_FIELD_FL_LOG2) {
+		unsigned long fl = flags & ~HIST_FIELD_FL_LOG2;
 		hist_field->fn = hist_field_log2;
+		hist_field->operands[0] = create_hist_field(field, fl);
+		hist_field->size = hist_field->operands[0]->size;
 		goto out;
 	}
 
@@ -388,7 +428,7 @@ static struct hist_field *create_hist_field(struct ftrace_event_field *field,
 		hist_field->fn = select_value_fn(field->size,
 						 field->is_signed);
 		if (!hist_field->fn) {
-			destroy_hist_field(hist_field);
+			destroy_hist_field(hist_field, 0);
 			return NULL;
 		}
 	}
@@ -405,7 +445,7 @@ static void destroy_hist_fields(struct hist_trigger_data *hist_data)
 
 	for (i = 0; i < TRACING_MAP_FIELDS_MAX; i++) {
 		if (hist_data->fields[i]) {
-			destroy_hist_field(hist_data->fields[i]);
+			destroy_hist_field(hist_data->fields[i], 0);
 			hist_data->fields[i] = NULL;
 		}
 	}
@@ -450,7 +490,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
 	}
 
 	field = trace_find_event_field(file->event_call, field_name);
-	if (!field) {
+	if (!field || !field->size) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -548,7 +588,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
 		}
 
 		field = trace_find_event_field(file->event_call, field_name);
-		if (!field) {
+		if (!field || !field->size) {
 			ret = -EINVAL;
 			goto out;
 		}
@@ -653,7 +693,6 @@ static int is_descending(const char *str)
 static int create_sort_keys(struct hist_trigger_data *hist_data)
 {
 	char *fields_str = hist_data->attrs->sort_key_str;
-	struct ftrace_event_field *field = NULL;
 	struct tracing_map_sort_key *sort_key;
 	int descending, ret = 0;
 	unsigned int i, j;
@@ -670,7 +709,9 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 	}
 
 	for (i = 0; i < TRACING_MAP_SORT_KEYS_MAX; i++) {
+		struct hist_field *hist_field;
 		char *field_str, *field_name;
+		const char *test_name;
 
 		sort_key = &hist_data->sort_keys[i];
 
@@ -703,8 +744,10 @@ static int create_sort_keys(struct hist_trigger_data *hist_data)
 		}
 
 		for (j = 1; j < hist_data->n_fields; j++) {
-			field = hist_data->fields[j]->field;
-			if (field && (strcmp(field_name, field->name) == 0)) {
+			hist_field = hist_data->fields[j];
+			test_name = hist_field_name(hist_field, 0);
+
+			if (strcmp(field_name, test_name) == 0) {
 				sort_key->field_idx = j;
 				descending = is_descending(field_str);
 				if (descending < 0) {
@@ -952,6 +995,7 @@ hist_trigger_entry_print(struct seq_file *m,
 	struct hist_field *key_field;
 	char str[KSYM_SYMBOL_LEN];
 	bool multiline = false;
+	const char *field_name;
 	unsigned int i;
 	u64 uval;
 
@@ -963,26 +1007,27 @@ hist_trigger_entry_print(struct seq_file *m,
 		if (i > hist_data->n_vals)
 			seq_puts(m, ", ");
 
+		field_name = hist_field_name(key_field, 0);
+
 		if (key_field->flags & HIST_FIELD_FL_HEX) {
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %llx",
-				   key_field->field->name, uval);
+			seq_printf(m, "%s: %llx", field_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM) {
 			uval = *(u64 *)(key + key_field->offset);
 			sprint_symbol_no_offset(str, uval);
-			seq_printf(m, "%s: [%llx] %-45s",
-				   key_field->field->name, uval, str);
+			seq_printf(m, "%s: [%llx] %-45s", field_name,
+				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_SYM_OFFSET) {
 			uval = *(u64 *)(key + key_field->offset);
 			sprint_symbol(str, uval);
-			seq_printf(m, "%s: [%llx] %-55s",
-				   key_field->field->name, uval, str);
+			seq_printf(m, "%s: [%llx] %-55s", field_name,
+				   uval, str);
 		} else if (key_field->flags & HIST_FIELD_FL_EXECNAME) {
 			char *comm = elt->private_data;
 
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %-16s[%10llu]",
-				   key_field->field->name, comm, uval);
+			seq_printf(m, "%s: %-16s[%10llu]", field_name,
+				   comm, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_SYSCALL) {
 			const char *syscall_name;
 
@@ -991,8 +1036,8 @@ hist_trigger_entry_print(struct seq_file *m,
 			if (!syscall_name)
 				syscall_name = "unknown_syscall";
 
-			seq_printf(m, "%s: %-30s[%3llu]",
-				   key_field->field->name, syscall_name, uval);
+			seq_printf(m, "%s: %-30s[%3llu]", field_name,
+				   syscall_name, uval);
 		} else if (key_field->flags & HIST_FIELD_FL_STACKTRACE) {
 			seq_puts(m, "stacktrace:\n");
 			hist_trigger_stacktrace_print(m,
@@ -1000,15 +1045,14 @@ hist_trigger_entry_print(struct seq_file *m,
 						      HIST_STACKTRACE_DEPTH);
 			multiline = true;
 		} else if (key_field->flags & HIST_FIELD_FL_LOG2) {
-			seq_printf(m, "%s: ~ 2^%-2llu", key_field->field->name,
+			seq_printf(m, "%s: ~ 2^%-2llu", field_name,
 				   *(u64 *)(key + key_field->offset));
 		} else if (key_field->flags & HIST_FIELD_FL_STRING) {
-			seq_printf(m, "%s: %-50s", key_field->field->name,
+			seq_printf(m, "%s: %-50s", field_name,
 				   (char *)(key + key_field->offset));
 		} else {
 			uval = *(u64 *)(key + key_field->offset);
-			seq_printf(m, "%s: %10llu", key_field->field->name,
-				   uval);
+			seq_printf(m, "%s: %10llu", field_name, uval);
 		}
 	}
 
@@ -1021,13 +1065,13 @@ hist_trigger_entry_print(struct seq_file *m,
 		   tracing_map_read_sum(elt, HITCOUNT_IDX));
 
 	for (i = 1; i < hist_data->n_vals; i++) {
+		field_name = hist_field_name(hist_data->fields[i], 0);
+
 		if (hist_data->fields[i]->flags & HIST_FIELD_FL_HEX) {
-			seq_printf(m, "  %s: %10llx",
-				   hist_data->fields[i]->field->name,
+			seq_printf(m, "  %s: %10llx", field_name,
 				   tracing_map_read_sum(elt, i));
 		} else {
-			seq_printf(m, "  %s: %10llu",
-				   hist_data->fields[i]->field->name,
+			seq_printf(m, "  %s: %10llu", field_name,
 				   tracing_map_read_sum(elt, i));
 		}
 	}
@@ -1062,7 +1106,7 @@ static void hist_trigger_show(struct seq_file *m,
 			      struct event_trigger_data *data, int n)
 {
 	struct hist_trigger_data *hist_data;
-	int n_entries, ret = 0;
+	int n_entries;
 
 	if (n > 0)
 		seq_puts(m, "\n\n");
@@ -1073,10 +1117,8 @@ static void hist_trigger_show(struct seq_file *m,
 
 	hist_data = data->private_data;
 	n_entries = print_entries(m, hist_data);
-	if (n_entries < 0) {
-		ret = n_entries;
+	if (n_entries < 0)
 		n_entries = 0;
-	}
 
 	seq_printf(m, "\nTotals:\n    Hits: %llu\n    Entries: %u\n    Dropped: %llu\n",
 		   (u64)atomic64_read(&hist_data->map->hits),
@@ -1142,7 +1184,9 @@ static const char *get_hist_field_flags(struct hist_field *hist_field)
 
 static void hist_field_print(struct seq_file *m, struct hist_field *hist_field)
 {
-	seq_printf(m, "%s", hist_field->field->name);
+	const char *field_name = hist_field_name(hist_field, 0);
+
+	seq_printf(m, "%s", field_name);
 	if (hist_field->flags) {
 		const char *flags_str = get_hist_field_flags(hist_field);
 
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7758bc0617cb..03ecb4465ee4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,10 @@
 
 #include "trace.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
 static struct trace_array		*irqsoff_trace __read_mostly;
 static int				tracer_enabled __read_mostly;
 
@@ -463,63 +467,43 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
 #else /* !CONFIG_PROVE_LOCKING */
 
 /*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
-/*
  * We are only interested in hardirq on/off events:
  */
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_on);
 
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_off);
 
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
 {
 	if (!preempt_trace() && irq_trace())
 		start_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
 #endif /* CONFIG_PROVE_LOCKING */
 #endif /*  CONFIG_IRQSOFF_TRACER */
 
 #ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
 {
 	if (preempt_trace() && !irq_trace())
 		stop_critical_timing(a0, a1);
 }
 
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
 {
 	if (preempt_trace() && !irq_trace())
 		start_critical_timing(a0, a1);
@@ -781,3 +765,100 @@ __init static int init_irqsoff_tracer(void)
 	return 0;
 }
 core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+void trace_hardirqs_on(void)
+{
+	if (!this_cpu_read(tracing_irq_cpu))
+		return;
+
+	trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+	tracer_hardirqs_on();
+
+	this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+	if (this_cpu_read(tracing_irq_cpu))
+		return;
+
+	this_cpu_write(tracing_irq_cpu, 1);
+
+	trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+	tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	if (!this_cpu_read(tracing_irq_cpu))
+		return;
+
+	trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+	tracer_hardirqs_on_caller(caller_addr);
+
+	this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	if (this_cpu_read(tracing_irq_cpu))
+		return;
+
+	this_cpu_write(tracing_irq_cpu, 1);
+
+	trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+	tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+	(defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	trace_preempt_enable_rcuidle(a0, a1);
+	tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	trace_preempt_disable_rcuidle(a0, a1);
+	tracer_preempt_off(a0, a1);
+}
+#endif
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 8a907e12b6b9..492700c5fb4d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -907,8 +907,8 @@ static int probes_open(struct inode *inode, struct file *file)
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
-	return traceprobe_probes_write(file, buffer, count, ppos,
-			create_trace_kprobe);
+	return trace_parse_run_command(file, buffer, count, ppos,
+				       create_trace_kprobe);
 }
 
 static const struct file_operations kprobe_events_ops = {
@@ -1174,13 +1174,12 @@ static void
 kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
-	struct bpf_prog *prog = call->prog;
 	struct kprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	head = this_cpu_ptr(call->perf_events);
@@ -1200,7 +1199,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
 	memset(&entry[1], 0, dsize);
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kprobe_perf_func);
 
@@ -1210,13 +1209,12 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 		    struct pt_regs *regs)
 {
 	struct trace_event_call *call = &tk->tp.call;
-	struct bpf_prog *prog = call->prog;
 	struct kretprobe_trace_entry_head *entry;
 	struct hlist_head *head;
 	int size, __size, dsize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	head = this_cpu_ptr(call->perf_events);
@@ -1236,7 +1234,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
 	entry->ret_ip = (unsigned long)ri->ret_addr;
 	store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 NOKPROBE_SYMBOL(kretprobe_perf_func);
 #endif	/* CONFIG_PERF_EVENTS */
@@ -1433,9 +1431,9 @@ static __init int kprobe_trace_self_tests_init(void)
 
 	pr_info("Testing kprobe tracing: ");
 
-	ret = traceprobe_command("p:testprobe kprobe_trace_selftest_target "
-				  "$stack $stack0 +0($stack)",
-				  create_trace_kprobe);
+	ret = trace_run_command("p:testprobe kprobe_trace_selftest_target "
+				"$stack $stack0 +0($stack)",
+				create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function entry.\n");
 		warn++;
@@ -1455,8 +1453,8 @@ static __init int kprobe_trace_self_tests_init(void)
 		}
 	}
 
-	ret = traceprobe_command("r:testprobe2 kprobe_trace_selftest_target "
-				  "$retval", create_trace_kprobe);
+	ret = trace_run_command("r:testprobe2 kprobe_trace_selftest_target "
+				"$retval", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on probing function return.\n");
 		warn++;
@@ -1526,13 +1524,13 @@ static __init int kprobe_trace_self_tests_init(void)
 			disable_trace_kprobe(tk, file);
 	}
 
-	ret = traceprobe_command("-:testprobe", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
 	}
 
-	ret = traceprobe_command("-:testprobe2", create_trace_kprobe);
+	ret = trace_run_command("-:testprobe2", create_trace_kprobe);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warn("error on deleting a probe.\n");
 		warn++;
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 52478f033f88..d59357308677 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -623,92 +623,6 @@ void traceprobe_free_probe_arg(struct probe_arg *arg)
 	kfree(arg->comm);
 }
 
-int traceprobe_command(const char *buf, int (*createfn)(int, char **))
-{
-	char **argv;
-	int argc, ret;
-
-	argc = 0;
-	ret = 0;
-	argv = argv_split(GFP_KERNEL, buf, &argc);
-	if (!argv)
-		return -ENOMEM;
-
-	if (argc)
-		ret = createfn(argc, argv);
-
-	argv_free(argv);
-
-	return ret;
-}
-
-#define WRITE_BUFSIZE  4096
-
-ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer,
-				size_t count, loff_t *ppos,
-				int (*createfn)(int, char **))
-{
-	char *kbuf, *buf, *tmp;
-	int ret = 0;
-	size_t done = 0;
-	size_t size;
-
-	kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
-	if (!kbuf)
-		return -ENOMEM;
-
-	while (done < count) {
-		size = count - done;
-
-		if (size >= WRITE_BUFSIZE)
-			size = WRITE_BUFSIZE - 1;
-
-		if (copy_from_user(kbuf, buffer + done, size)) {
-			ret = -EFAULT;
-			goto out;
-		}
-		kbuf[size] = '\0';
-		buf = kbuf;
-		do {
-			tmp = strchr(buf, '\n');
-			if (tmp) {
-				*tmp = '\0';
-				size = tmp - buf + 1;
-			} else {
-				size = strlen(buf);
-				if (done + size < count) {
-					if (buf != kbuf)
-						break;
-					/* This can accept WRITE_BUFSIZE - 2 ('\n' + '\0') */
-					pr_warn("Line length is too long: Should be less than %d\n",
-						WRITE_BUFSIZE - 2);
-					ret = -EINVAL;
-					goto out;
-				}
-			}
-			done += size;
-
-			/* Remove comments */
-			tmp = strchr(buf, '#');
-
-			if (tmp)
-				*tmp = '\0';
-
-			ret = traceprobe_command(buf, createfn);
-			if (ret)
-				goto out;
-			buf += size;
-
-		} while (done < count);
-	}
-	ret = done;
-
-out:
-	kfree(kbuf);
-
-	return ret;
-}
-
 static int __set_print_fmt(struct trace_probe *tp, char *buf, int len,
 			   bool is_return)
 {
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 903273c93e61..fb66e3eaa192 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -42,7 +42,6 @@
 
 #define MAX_TRACE_ARGS		128
 #define MAX_ARGSTR_LEN		63
-#define MAX_EVENT_NAME_LEN	64
 #define MAX_STRING_SIZE		PATH_MAX
 
 /* Reserved field names */
@@ -356,12 +355,6 @@ extern void traceprobe_free_probe_arg(struct probe_arg *arg);
 
 extern int traceprobe_split_symbol_offset(char *symbol, unsigned long *offset);
 
-extern ssize_t traceprobe_probes_write(struct file *file,
-		const char __user *buffer, size_t count, loff_t *ppos,
-		int (*createfn)(int, char**));
-
-extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
-
 /* Sum up total data length for dynamic arraies (strings) */
 static nokprobe_inline int
 __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index cd70eb5df38e..11e9daa4a568 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -60,7 +60,7 @@ static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
  * Test the trace buffer to see if all the elements
  * are still sane.
  */
-static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
+static int __maybe_unused trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
 {
 	unsigned long flags, cnt = 0;
 	int cpu, ret = 0;
@@ -1151,38 +1151,6 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_SCHED_TRACER */
 
-#ifdef CONFIG_CONTEXT_SWITCH_TRACER
-int
-trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
-{
-	unsigned long count;
-	int ret;
-
-	/* start the tracing */
-	ret = tracer_init(trace, tr);
-	if (ret) {
-		warn_failed_init_tracer(trace, ret);
-		return ret;
-	}
-
-	/* Sleep for a 1/10 of a second */
-	msleep(100);
-	/* stop the tracing. */
-	tracing_stop();
-	/* check the trace buffer */
-	ret = trace_test_buffer(&tr->trace_buffer, &count);
-	trace->reset(tr);
-	tracing_start();
-
-	if (!ret && !count) {
-		printk(KERN_CONT ".. no entries found ..");
-		ret = -1;
-	}
-
-	return ret;
-}
-#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
-
 #ifdef CONFIG_BRANCH_TRACER
 int
 trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a642f2c64f..f93a56d2db27 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -560,9 +560,10 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
 static int sys_perf_refcount_enter;
 static int sys_perf_refcount_exit;
 
-static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
-			      struct syscall_metadata *sys_data,
-			      struct syscall_trace_enter *rec) {
+static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
+			       struct syscall_metadata *sys_data,
+			       struct syscall_trace_enter *rec)
+{
 	struct syscall_tp_t {
 		unsigned long long regs;
 		unsigned long syscall_nr;
@@ -574,7 +575,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
 	param.syscall_nr = rec->nr;
 	for (i = 0; i < sys_data->nb_args; i++)
 		param.args[i] = rec->args[i];
-	return trace_call_bpf(prog, &param);
+	return trace_call_bpf(call, &param);
 }
 
 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
@@ -582,7 +583,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_enter *rec;
 	struct hlist_head *head;
-	struct bpf_prog *prog;
+	bool valid_prog_array;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -597,9 +598,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	if (!sys_data)
 		return;
 
-	prog = READ_ONCE(sys_data->enter_event->prog);
 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
-	if (!prog && hlist_empty(head))
+	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
+	if (!valid_prog_array && hlist_empty(head))
 		return;
 
 	/* get the size after alignment with the u32 buffer size field */
@@ -615,7 +616,8 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
 			       (unsigned long *)&rec->args);
 
-	if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
+	if ((valid_prog_array &&
+	     !perf_call_bpf_enter(sys_data->enter_event, regs, sys_data, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;
@@ -623,7 +625,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
 
 	perf_trace_buf_submit(rec, size, rctx,
 			      sys_data->enter_event->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
 }
 
 static int perf_sysenter_enable(struct trace_event_call *call)
@@ -660,8 +662,9 @@ static void perf_sysenter_disable(struct trace_event_call *call)
 	mutex_unlock(&syscall_trace_lock);
 }
 
-static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
-			      struct syscall_trace_exit *rec) {
+static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
+			      struct syscall_trace_exit *rec)
+{
 	struct syscall_tp_t {
 		unsigned long long regs;
 		unsigned long syscall_nr;
@@ -671,7 +674,7 @@ static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
 	*(struct pt_regs **)&param = regs;
 	param.syscall_nr = rec->nr;
 	param.ret = rec->ret;
-	return trace_call_bpf(prog, &param);
+	return trace_call_bpf(call, &param);
 }
 
 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -679,7 +682,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	struct syscall_metadata *sys_data;
 	struct syscall_trace_exit *rec;
 	struct hlist_head *head;
-	struct bpf_prog *prog;
+	bool valid_prog_array;
 	int syscall_nr;
 	int rctx;
 	int size;
@@ -694,9 +697,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	if (!sys_data)
 		return;
 
-	prog = READ_ONCE(sys_data->exit_event->prog);
 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
-	if (!prog && hlist_empty(head))
+	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
+	if (!valid_prog_array && hlist_empty(head))
 		return;
 
 	/* We can probably do that at build time */
@@ -710,14 +713,15 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
 	rec->nr = syscall_nr;
 	rec->ret = syscall_get_return_value(current, regs);
 
-	if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
+	if ((valid_prog_array &&
+	     !perf_call_bpf_exit(sys_data->exit_event, regs, rec)) ||
 	    hlist_empty(head)) {
 		perf_swevent_put_recursion_context(rctx);
 		return;
 	}
 
 	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
-			      1, regs, head, NULL, NULL);
+			      1, regs, head, NULL);
 }
 
 static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4525e0271a53..40592e7b3568 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -651,7 +651,7 @@ static int probes_open(struct inode *inode, struct file *file)
 static ssize_t probes_write(struct file *file, const char __user *buffer,
 			    size_t count, loff_t *ppos)
 {
-	return traceprobe_probes_write(file, buffer, count, ppos, create_trace_uprobe);
+	return trace_parse_run_command(file, buffer, count, ppos, create_trace_uprobe);
 }
 
 static const struct file_operations uprobe_events_ops = {
@@ -1113,13 +1113,12 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 {
 	struct trace_event_call *call = &tu->tp.call;
 	struct uprobe_trace_entry_head *entry;
-	struct bpf_prog *prog = call->prog;
 	struct hlist_head *head;
 	void *data;
 	int size, esize;
 	int rctx;
 
-	if (prog && !trace_call_bpf(prog, regs))
+	if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
 		return;
 
 	esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
@@ -1156,7 +1155,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
 	}
 
 	perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
-			      head, NULL, NULL);
+			      head, NULL);
  out:
 	preempt_enable();
 }
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 305039b122fa..07e75344725b 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -428,7 +428,8 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
 
 		if (test_key && test_key == key_hash && entry->val &&
 		    keys_match(key, entry->val->key, map->key_size)) {
-			atomic64_inc(&map->hits);
+			if (!lookup_only)
+				atomic64_inc(&map->hits);
 			return entry->val;
 		}
 
diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h
index ab0ca77331d0..5b5bbf8ae550 100644
--- a/kernel/trace/tracing_map.h
+++ b/kernel/trace/tracing_map.h
@@ -6,7 +6,7 @@
 #define TRACING_MAP_BITS_MAX		17
 #define TRACING_MAP_BITS_MIN		7
 
-#define TRACING_MAP_KEYS_MAX		2
+#define TRACING_MAP_KEYS_MAX		3
 #define TRACING_MAP_VALS_MAX		3
 #define TRACING_MAP_FIELDS_MAX		(TRACING_MAP_KEYS_MAX + \
 					 TRACING_MAP_VALS_MAX)
diff --git a/kernel/umh.c b/kernel/umh.c
index 6ff9905250ff..18e5fa4b0e71 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -537,14 +537,14 @@ static int proc_cap_handler(struct ctl_table *table, int write,
 	/*
 	 * Drop everything not in the new_cap (but don't add things)
 	 */
-	spin_lock(&umh_sysctl_lock);
 	if (write) {
+		spin_lock(&umh_sysctl_lock);
 		if (table->data == CAP_BSET)
 			usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
 		if (table->data == CAP_PI)
 			usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+		spin_unlock(&umh_sysctl_lock);
 	}
-	spin_unlock(&umh_sysctl_lock);
 
 	return 0;
 }
diff --git a/kernel/user.c b/kernel/user.c
index 00281add65b2..9a20acce460d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -26,26 +26,32 @@
 struct user_namespace init_user_ns = {
 	.uid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.gid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.projid_map = {
 		.nr_extents = 1,
-		.extent[0] = {
-			.first = 0,
-			.lower_first = 0,
-			.count = 4294967295U,
+		{
+			.extent[0] = {
+				.first = 0,
+				.lower_first = 0,
+				.count = 4294967295U,
+			},
 		},
 	},
 	.count = ATOMIC_INIT(3),
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d32b45662fb6..246d4d4ce5c7 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -23,6 +23,8 @@
 #include <linux/ctype.h>
 #include <linux/projid.h>
 #include <linux/fs_struct.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 static DEFINE_MUTEX(userns_state_mutex);
@@ -181,6 +183,18 @@ static void free_user_ns(struct work_struct *work)
 	do {
 		struct ucounts *ucounts = ns->ucounts;
 		parent = ns->parent;
+		if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->gid_map.forward);
+			kfree(ns->gid_map.reverse);
+		}
+		if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->uid_map.forward);
+			kfree(ns->uid_map.reverse);
+		}
+		if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+			kfree(ns->projid_map.forward);
+			kfree(ns->projid_map.reverse);
+		}
 		retire_userns_sysctls(ns);
 #ifdef CONFIG_PERSISTENT_KEYRINGS
 		key_put(ns->persistent_keyring_register);
@@ -198,26 +212,101 @@ void __put_user_ns(struct user_namespace *ns)
 }
 EXPORT_SYMBOL(__put_user_ns);
 
-static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+/**
+ * idmap_key struct holds the information necessary to find an idmapping in a
+ * sorted idmap array. It is passed to cmp_map_id() as first argument.
+ */
+struct idmap_key {
+	bool map_up; /* true  -> id from kid; false -> kid from id */
+	u32 id; /* id to find */
+	u32 count; /* == 0 unless used with map_id_range_down() */
+};
+
+/**
+ * cmp_map_id - Function to be passed to bsearch() to find the requested
+ * idmapping. Expects struct idmap_key to be passed via @k.
+ */
+static int cmp_map_id(const void *k, const void *e)
 {
-	unsigned idx, extents;
+	u32 first, last, id2;
+	const struct idmap_key *key = k;
+	const struct uid_gid_extent *el = e;
+
+	id2 = key->id + key->count - 1;
+
+	/* handle map_id_{down,up}() */
+	if (key->map_up)
+		first = el->lower_first;
+	else
+		first = el->first;
+
+	last = first + el->count - 1;
+
+	if (key->id >= first && key->id <= last &&
+	    (id2 >= first && id2 <= last))
+		return 0;
+
+	if (key->id < first || id2 < first)
+		return -1;
+
+	return 1;
+}
+
+/**
+ * map_id_range_down_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
+{
+	struct idmap_key key;
+
+	key.map_up = false;
+	key.count = count;
+	key.id = id;
+
+	return bsearch(&key, map->forward, extents,
+		       sizeof(struct uid_gid_extent), cmp_map_id);
+}
+
+/**
+ * map_id_range_down_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count)
+{
+	unsigned idx;
 	u32 first, last, id2;
 
 	id2 = id + count - 1;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
 		first = map->extent[idx].first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last &&
 		    (id2 >= first && id2 <= last))
-			break;
+			return &map->extent[idx];
 	}
+	return NULL;
+}
+
+static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
+{
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
+	smp_rmb();
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		extent = map_id_range_down_base(extents, map, id, count);
+	else
+		extent = map_id_range_down_max(extents, map, id, count);
+
 	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].lower_first;
+	if (extent)
+		id = (id - extent->first) + extent->lower_first;
 	else
 		id = (u32) -1;
 
@@ -226,44 +315,61 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count)
 
 static u32 map_id_down(struct uid_gid_map *map, u32 id)
 {
-	unsigned idx, extents;
+	return map_id_range_down(map, id, 1);
+}
+
+/**
+ * map_id_up_base - Find idmap via binary search in static extent array.
+ * Can only be called if number of mappings is equal or less than
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_up_base(unsigned extents, struct uid_gid_map *map, u32 id)
+{
+	unsigned idx;
 	u32 first, last;
 
 	/* Find the matching extent */
-	extents = map->nr_extents;
-	smp_rmb();
 	for (idx = 0; idx < extents; idx++) {
-		first = map->extent[idx].first;
+		first = map->extent[idx].lower_first;
 		last = first + map->extent[idx].count - 1;
 		if (id >= first && id <= last)
-			break;
+			return &map->extent[idx];
 	}
-	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].lower_first;
-	else
-		id = (u32) -1;
+	return NULL;
+}
 
-	return id;
+/**
+ * map_id_up_max - Find idmap via binary search in ordered idmap array.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static struct uid_gid_extent *
+map_id_up_max(unsigned extents, struct uid_gid_map *map, u32 id)
+{
+	struct idmap_key key;
+
+	key.map_up = true;
+	key.count = 1;
+	key.id = id;
+
+	return bsearch(&key, map->reverse, extents,
+		       sizeof(struct uid_gid_extent), cmp_map_id);
 }
 
 static u32 map_id_up(struct uid_gid_map *map, u32 id)
 {
-	unsigned idx, extents;
-	u32 first, last;
-
-	/* Find the matching extent */
-	extents = map->nr_extents;
+	struct uid_gid_extent *extent;
+	unsigned extents = map->nr_extents;
 	smp_rmb();
-	for (idx = 0; idx < extents; idx++) {
-		first = map->extent[idx].lower_first;
-		last = first + map->extent[idx].count - 1;
-		if (id >= first && id <= last)
-			break;
-	}
+
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		extent = map_id_up_base(extents, map, id);
+	else
+		extent = map_id_up_max(extents, map, id);
+
 	/* Map the id or note failure */
-	if (idx < extents)
-		id = (id - first) + map->extent[idx].first;
+	if (extent)
+		id = (id - extent->lower_first) + extent->first;
 	else
 		id = (u32) -1;
 
@@ -540,13 +646,17 @@ static int projid_m_show(struct seq_file *seq, void *v)
 static void *m_start(struct seq_file *seq, loff_t *ppos,
 		     struct uid_gid_map *map)
 {
-	struct uid_gid_extent *extent = NULL;
 	loff_t pos = *ppos;
+	unsigned extents = map->nr_extents;
+	smp_rmb();
 
-	if (pos < map->nr_extents)
-		extent = &map->extent[pos];
+	if (pos >= extents)
+		return NULL;
 
-	return extent;
+	if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return &map->extent[pos];
+
+	return &map->forward[pos];
 }
 
 static void *uid_m_start(struct seq_file *seq, loff_t *ppos)
@@ -618,7 +728,10 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
 		u32 prev_upper_last, prev_lower_last;
 		struct uid_gid_extent *prev;
 
-		prev = &new_map->extent[idx];
+		if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			prev = &new_map->extent[idx];
+		else
+			prev = &new_map->forward[idx];
 
 		prev_upper_first = prev->first;
 		prev_lower_first = prev->lower_first;
@@ -638,6 +751,101 @@ static bool mappings_overlap(struct uid_gid_map *new_map,
 	return false;
 }
 
+/**
+ * insert_extent - Safely insert a new idmap extent into struct uid_gid_map.
+ * Takes care to allocate a 4K block of memory if the number of mappings exceeds
+ * UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent)
+{
+	struct uid_gid_extent *dest;
+
+	if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) {
+		struct uid_gid_extent *forward;
+
+		/* Allocate memory for 340 mappings. */
+		forward = kmalloc(sizeof(struct uid_gid_extent) *
+				 UID_GID_MAP_MAX_EXTENTS, GFP_KERNEL);
+		if (!forward)
+			return -ENOMEM;
+
+		/* Copy over memory. Only set up memory for the forward pointer.
+		 * Defer the memory setup for the reverse pointer.
+		 */
+		memcpy(forward, map->extent,
+		       map->nr_extents * sizeof(map->extent[0]));
+
+		map->forward = forward;
+		map->reverse = NULL;
+	}
+
+	if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS)
+		dest = &map->extent[map->nr_extents];
+	else
+		dest = &map->forward[map->nr_extents];
+
+	*dest = *extent;
+	map->nr_extents++;
+	return 0;
+}
+
+/* cmp function to sort() forward mappings */
+static int cmp_extents_forward(const void *a, const void *b)
+{
+	const struct uid_gid_extent *e1 = a;
+	const struct uid_gid_extent *e2 = b;
+
+	if (e1->first < e2->first)
+		return -1;
+
+	if (e1->first > e2->first)
+		return 1;
+
+	return 0;
+}
+
+/* cmp function to sort() reverse mappings */
+static int cmp_extents_reverse(const void *a, const void *b)
+{
+	const struct uid_gid_extent *e1 = a;
+	const struct uid_gid_extent *e2 = b;
+
+	if (e1->lower_first < e2->lower_first)
+		return -1;
+
+	if (e1->lower_first > e2->lower_first)
+		return 1;
+
+	return 0;
+}
+
+/**
+ * sort_idmaps - Sorts an array of idmap entries.
+ * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS.
+ */
+static int sort_idmaps(struct uid_gid_map *map)
+{
+	if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+		return 0;
+
+	/* Sort forward array. */
+	sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent),
+	     cmp_extents_forward, NULL);
+
+	/* Only copy the memory from forward we actually need. */
+	map->reverse = kmemdup(map->forward,
+			       map->nr_extents * sizeof(struct uid_gid_extent),
+			       GFP_KERNEL);
+	if (!map->reverse)
+		return -ENOMEM;
+
+	/* Sort reverse array. */
+	sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent),
+	     cmp_extents_reverse, NULL);
+
+	return 0;
+}
+
 static ssize_t map_write(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos,
 			 int cap_setid,
@@ -648,7 +856,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	struct user_namespace *ns = seq->private;
 	struct uid_gid_map new_map;
 	unsigned idx;
-	struct uid_gid_extent *extent = NULL;
+	struct uid_gid_extent extent;
 	char *kbuf = NULL, *pos, *next_line;
 	ssize_t ret = -EINVAL;
 
@@ -673,6 +881,8 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	 */
 	mutex_lock(&userns_state_mutex);
 
+	memset(&new_map, 0, sizeof(struct uid_gid_map));
+
 	ret = -EPERM;
 	/* Only allow one successful write to the map */
 	if (map->nr_extents != 0)
@@ -700,9 +910,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	/* Parse the user data */
 	ret = -EINVAL;
 	pos = kbuf;
-	new_map.nr_extents = 0;
 	for (; pos; pos = next_line) {
-		extent = &new_map.extent[new_map.nr_extents];
 
 		/* Find the end of line and ensure I don't look past it */
 		next_line = strchr(pos, '\n');
@@ -714,17 +922,17 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		}
 
 		pos = skip_spaces(pos);
-		extent->first = simple_strtoul(pos, &pos, 10);
+		extent.first = simple_strtoul(pos, &pos, 10);
 		if (!isspace(*pos))
 			goto out;
 
 		pos = skip_spaces(pos);
-		extent->lower_first = simple_strtoul(pos, &pos, 10);
+		extent.lower_first = simple_strtoul(pos, &pos, 10);
 		if (!isspace(*pos))
 			goto out;
 
 		pos = skip_spaces(pos);
-		extent->count = simple_strtoul(pos, &pos, 10);
+		extent.count = simple_strtoul(pos, &pos, 10);
 		if (*pos && !isspace(*pos))
 			goto out;
 
@@ -734,29 +942,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 			goto out;
 
 		/* Verify we have been given valid starting values */
-		if ((extent->first == (u32) -1) ||
-		    (extent->lower_first == (u32) -1))
+		if ((extent.first == (u32) -1) ||
+		    (extent.lower_first == (u32) -1))
 			goto out;
 
 		/* Verify count is not zero and does not cause the
 		 * extent to wrap
 		 */
-		if ((extent->first + extent->count) <= extent->first)
+		if ((extent.first + extent.count) <= extent.first)
 			goto out;
-		if ((extent->lower_first + extent->count) <=
-		     extent->lower_first)
+		if ((extent.lower_first + extent.count) <=
+		     extent.lower_first)
 			goto out;
 
 		/* Do the ranges in extent overlap any previous extents? */
-		if (mappings_overlap(&new_map, extent))
+		if (mappings_overlap(&new_map, &extent))
 			goto out;
 
-		new_map.nr_extents++;
-
-		/* Fail if the file contains too many extents */
-		if ((new_map.nr_extents == UID_GID_MAP_MAX_EXTENTS) &&
+		if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS &&
 		    (next_line != NULL))
 			goto out;
+
+		ret = insert_extent(&new_map, &extent);
+		if (ret < 0)
+			goto out;
+		ret = -EINVAL;
 	}
 	/* Be very certaint the new map actually exists */
 	if (new_map.nr_extents == 0)
@@ -767,16 +977,26 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
 		goto out;
 
+	ret = sort_idmaps(&new_map);
+	if (ret < 0)
+		goto out;
+
+	ret = -EPERM;
 	/* Map the lower ids from the parent user namespace to the
 	 * kernel global id space.
 	 */
 	for (idx = 0; idx < new_map.nr_extents; idx++) {
+		struct uid_gid_extent *e;
 		u32 lower_first;
-		extent = &new_map.extent[idx];
+
+		if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS)
+			e = &new_map.extent[idx];
+		else
+			e = &new_map.forward[idx];
 
 		lower_first = map_id_range_down(parent_map,
-						extent->lower_first,
-						extent->count);
+						e->lower_first,
+						e->count);
 
 		/* Fail if we can not map the specified extent to
 		 * the kernel global id space.
@@ -784,18 +1004,31 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 		if (lower_first == (u32) -1)
 			goto out;
 
-		extent->lower_first = lower_first;
+		e->lower_first = lower_first;
 	}
 
 	/* Install the map */
-	memcpy(map->extent, new_map.extent,
-		new_map.nr_extents*sizeof(new_map.extent[0]));
+	if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+		memcpy(map->extent, new_map.extent,
+		       new_map.nr_extents * sizeof(new_map.extent[0]));
+	} else {
+		map->forward = new_map.forward;
+		map->reverse = new_map.reverse;
+	}
 	smp_wmb();
 	map->nr_extents = new_map.nr_extents;
 
 	*ppos = count;
 	ret = count;
 out:
+	if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+		kfree(new_map.forward);
+		kfree(new_map.reverse);
+		map->forward = NULL;
+		map->reverse = NULL;
+		map->nr_extents = 0;
+	}
+
 	mutex_unlock(&userns_state_mutex);
 	kfree(kbuf);
 	return ret;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7368b57842ea..8fdb710bfdd7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1509,7 +1509,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
 	struct work_struct *work = &dwork->work;
 
 	WARN_ON_ONCE(!wq);
-	WARN_ON_ONCE(timer->function != (TIMER_FUNC_TYPE)delayed_work_timer_fn);
+	WARN_ON_ONCE(timer->function != delayed_work_timer_fn);
 	WARN_ON_ONCE(timer_pending(timer));
 	WARN_ON_ONCE(!list_empty(&work->entry));
 
@@ -4990,9 +4990,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
  *
  * Unbound workqueues have the following extra attributes.
  *
- *  id		RO int	: the associated pool ID
+ *  pool_ids	RO int	: the associated pool IDs for each node
  *  nice	RW int	: nice value of the workers
  *  cpumask	RW mask	: bitmask of allowed CPUs for the workers
+ *  numa	RW bool	: whether enable NUMA affinity
  */
 struct wq_device {
 	struct workqueue_struct		*wq;
author	Ingo Molnar <mingo@kernel.org>	2017-12-06 22:39:39 +0100
committer	Ingo Molnar <mingo@kernel.org>	2017-12-06 22:39:39 +0100
commit	d6eabce2577a695197e9433302fd6a9f0e1a7666 (patch)
tree	e5c5bcc7803879d5d911cc8faeb74a946c242395 /kernel
parent	Merge branch 'perf/urgent' of git://git.kernel.org/pub/scm/linux/kernel/git/a... (diff)
parent	x86: don't hash faulting address in oops printout (diff)
download	linux-d6eabce2577a695197e9433302fd6a9f0e1a7666.tar.xz linux-d6eabce2577a695197e9433302fd6a9f0e1a7666.zip