summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/audit.c48
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/audit_tree.c6
-rw-r--r--kernel/auditfilter.c14
-rw-r--r--kernel/bpf/Makefile4
-rw-r--r--kernel/bpf/arraymap.c26
-rw-r--r--kernel/bpf/core.c33
-rw-r--r--kernel/bpf/hashtab.c18
-rw-r--r--kernel/bpf/helpers.c7
-rw-r--r--kernel/bpf/inode.c387
-rw-r--r--kernel/bpf/syscall.c174
-rw-r--r--kernel/bpf/verifier.c116
-rw-r--r--kernel/cgroup.c1299
-rw-r--r--kernel/cgroup_pids.c8
-rw-r--r--kernel/context_tracking.c80
-rw-r--r--kernel/cpuset.c86
-rw-r--r--kernel/events/core.c23
-rw-r--r--kernel/fork.c8
-rw-r--r--kernel/futex.c4
-rw-r--r--kernel/irq/cpuhotplug.c2
-rw-r--r--kernel/irq/internals.h5
-rw-r--r--kernel/irq/manage.c9
-rw-r--r--kernel/irq/pm.c5
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kexec_core.c4
-rw-r--r--kernel/kexec_file.c2
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/memremap.c16
-rw-r--r--kernel/module_signing.c1
-rw-r--r--kernel/panic.c10
-rw-r--r--kernel/params.c20
-rw-r--r--kernel/power/hibernate.c2
-rw-r--r--kernel/power/main.c17
-rw-r--r--kernel/power/snapshot.c2
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/power/swap.c16
-rw-r--r--kernel/printk/printk.c14
-rw-r--r--kernel/ptrace.c5
-rw-r--r--kernel/sched/core.c8
-rw-r--r--kernel/sched/cputime.c2
-rw-r--r--kernel/seccomp.c78
-rw-r--r--kernel/signal.c53
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/sysctl.c33
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/timekeeping.c2
-rw-r--r--kernel/time/timer.c22
-rw-r--r--kernel/trace/Kconfig9
-rw-r--r--kernel/trace/blktrace.c27
-rw-r--r--kernel/trace/bpf_trace.c55
-rw-r--r--kernel/trace/ftrace.c197
-rw-r--r--kernel/trace/ring_buffer.c20
-rw-r--r--kernel/trace/ring_buffer_benchmark.c79
-rw-r--r--kernel/trace/trace.c470
-rw-r--r--kernel/trace/trace.h169
-rw-r--r--kernel/trace/trace_benchmark.c2
-rw-r--r--kernel/trace/trace_branch.c15
-rw-r--r--kernel/trace/trace_events.c506
-rw-r--r--kernel/trace/trace_events_filter.c8
-rw-r--r--kernel/trace/trace_export.c2
-rw-r--r--kernel/trace/trace_functions_graph.c69
-rw-r--r--kernel/trace/trace_irqsoff.c106
-rw-r--r--kernel/trace/trace_kdb.c8
-rw-r--r--kernel/trace/trace_mmiotrace.c4
-rw-r--r--kernel/trace/trace_output.c97
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_printk.c14
-rw-r--r--kernel/trace/trace_probe.h8
-rw-r--r--kernel/trace/trace_sched_wakeup.c120
-rw-r--r--kernel/trace/trace_stack.c92
-rw-r--r--kernel/trace/trace_syscalls.c3
-rw-r--r--kernel/tracepoint.c61
-rw-r--r--kernel/watchdog.c121
-rw-r--r--kernel/workqueue.c26
78 files changed, 3357 insertions, 1626 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index 790d83c7d160..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -5,4 +5,3 @@ config_data.h
config_data.gz
timeconst.h
hz.bc
-x509_certificate_list
diff --git a/kernel/audit.c b/kernel/audit.c
index 662c007635fb..5ffcbd354a52 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -407,16 +407,33 @@ static void audit_printk_skb(struct sk_buff *skb)
static void kauditd_send_skb(struct sk_buff *skb)
{
int err;
+ int attempts = 0;
+#define AUDITD_RETRIES 5
+
+restart:
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
- BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+ pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
+ audit_pid, err);
if (audit_pid) {
- pr_err("*NO* daemon at audit_pid=%d\n", audit_pid);
- audit_log_lost("auditd disappeared");
- audit_pid = 0;
- audit_sock = NULL;
+ if (err == -ECONNREFUSED || err == -EPERM
+ || ++attempts >= AUDITD_RETRIES) {
+ char s[32];
+
+ snprintf(s, sizeof(s), "audit_pid=%d reset", audit_pid);
+ audit_log_lost(s);
+ audit_pid = 0;
+ audit_sock = NULL;
+ } else {
+ pr_warn("re-scheduling(#%d) write to audit_pid=%d\n",
+ attempts, audit_pid);
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ goto restart;
+ }
}
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
@@ -684,25 +701,22 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
return err;
}
-static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
+static void audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
{
- int rc = 0;
uid_t uid = from_kuid(&init_user_ns, current_uid());
pid_t pid = task_tgid_nr(current);
if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
- return rc;
+ return;
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
if (unlikely(!*ab))
- return rc;
+ return;
audit_log_format(*ab, "pid=%d uid=%u", pid, uid);
audit_log_session_info(*ab);
audit_log_task_context(*ab);
-
- return rc;
}
int is_audit_feature_set(int i)
@@ -1357,16 +1371,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
if (unlikely(audit_filter_type(type)))
return NULL;
- if (gfp_mask & __GFP_WAIT) {
+ if (gfp_mask & __GFP_DIRECT_RECLAIM) {
if (audit_pid && audit_pid == current->pid)
- gfp_mask &= ~__GFP_WAIT;
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
else
reserve = 0;
}
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) {
long sleep_time;
sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
@@ -1566,14 +1580,14 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
* @string: string to be checked
* @len: max length of the string to check
*/
-int audit_string_contains_control(const char *string, size_t len)
+bool audit_string_contains_control(const char *string, size_t len)
{
const unsigned char *p;
for (p = string; p < (const unsigned char *)string + len; p++) {
if (*p == '"' || *p < 0x21 || *p > 0x7e)
- return 1;
+ return true;
}
- return 0;
+ return false;
}
/**
diff --git a/kernel/audit.h b/kernel/audit.h
index dadf86a0e59e..de6cbb7cf547 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -301,7 +301,7 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#ifdef CONFIG_AUDIT_TREE
extern struct audit_chunk *audit_tree_lookup(const struct inode *);
extern void audit_put_chunk(struct audit_chunk *);
-extern int audit_tree_match(struct audit_chunk *, struct audit_tree *);
+extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
extern int audit_make_tree(struct audit_krule *, char *, u32);
extern int audit_add_tree_rule(struct audit_krule *);
extern int audit_remove_tree_rule(struct audit_krule *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 94ecdabda8e6..5efe9b299a12 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -197,13 +197,13 @@ struct audit_chunk *audit_tree_lookup(const struct inode *inode)
return NULL;
}
-int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
+bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
{
int n;
for (n = 0; n < chunk->count; n++)
if (chunk->owners[n].owner == tree)
- return 1;
- return 0;
+ return true;
+ return false;
}
/* tagging and untagging inodes with trees */
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7714d93edb85..b8ff9e193753 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -39,13 +39,13 @@
* Locking model:
*
* audit_filter_mutex:
- * Synchronizes writes and blocking reads of audit's filterlist
- * data. Rcu is used to traverse the filterlist and access
- * contents of structs audit_entry, audit_watch and opaque
- * LSM rules during filtering. If modified, these structures
- * must be copied and replace their counterparts in the filterlist.
- * An audit_parent struct is not accessed during filtering, so may
- * be written directly provided audit_filter_mutex is held.
+ * Synchronizes writes and blocking reads of audit's filterlist
+ * data. Rcu is used to traverse the filterlist and access
+ * contents of structs audit_entry, audit_watch and opaque
+ * LSM rules during filtering. If modified, these structures
+ * must be copied and replace their counterparts in the filterlist.
+ * An audit_parent struct is not accessed during filtering, so may
+ * be written directly provided audit_filter_mutex is held.
*/
/* Audit filter lists, defined in <linux/audit.h> */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index e6983be12bd3..13272582eee0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,2 +1,4 @@
obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
+
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace107f236..3f4c99e06c6b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -15,6 +15,7 @@
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
+#include <linux/perf_event.h>
/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
@@ -48,7 +49,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
array->map.key_size = attr->key_size;
array->map.value_size = attr->value_size;
array->map.max_entries = attr->max_entries;
-
+ array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT;
array->elem_size = elem_size;
return &array->map;
@@ -291,14 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
attr = perf_event_attrs(event);
if (IS_ERR(attr))
- return (void *)attr;
+ goto err;
- if (attr->type != PERF_TYPE_RAW &&
- attr->type != PERF_TYPE_HARDWARE) {
- perf_event_release_kernel(event);
- return ERR_PTR(-EINVAL);
- }
- return event;
+ if (attr->inherit)
+ goto err;
+
+ if (attr->type == PERF_TYPE_RAW)
+ return event;
+
+ if (attr->type == PERF_TYPE_HARDWARE)
+ return event;
+
+ if (attr->type == PERF_TYPE_SOFTWARE &&
+ attr->config == PERF_COUNT_SW_BPF_OUTPUT)
+ return event;
+err:
+ perf_event_release_kernel(event);
+ return ERR_PTR(-EINVAL);
}
static void perf_event_fd_array_put_ptr(void *ptr)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 67c380cfa9ca..334b1bdd572c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -82,6 +82,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
if (fp == NULL)
return NULL;
+ kmemcheck_annotate_bitfield(fp, meta);
+
aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
if (aux == NULL) {
vfree(fp);
@@ -90,6 +92,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
fp->pages = size / PAGE_SIZE;
fp->aux = aux;
+ fp->aux->prog = fp;
return fp;
}
@@ -110,8 +113,11 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
if (fp != NULL) {
+ kmemcheck_annotate_bitfield(fp, meta);
+
memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
fp->pages = size / PAGE_SIZE;
+ fp->aux->prog = fp;
/* We keep fp->aux from fp_old around in the new
* reallocated structure.
@@ -722,11 +728,36 @@ void bpf_prog_free(struct bpf_prog *fp)
struct bpf_prog_aux *aux = fp->aux;
INIT_WORK(&aux->work, bpf_prog_free_deferred);
- aux->prog = fp;
schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
+/* RNG for unpriviledged user space with separated state from prandom_u32(). */
+static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state);
+
+void bpf_user_rnd_init_once(void)
+{
+ prandom_init_once(&bpf_user_rnd_state);
+}
+
+u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ /* Should someone ever have the rather unwise idea to use some
+ * of the registers passed into this function, then note that
+ * this function is called from native eBPF and classic-to-eBPF
+ * transformations. Register assignments from both sides are
+ * different, f.e. classic always sets fn(ctx, A, X) here.
+ */
+ struct rnd_state *state;
+ u32 res;
+
+ state = &get_cpu_var(bpf_user_rnd_state);
+ res = prandom_u32_state(state);
+ put_cpu_var(state);
+
+ return res;
+}
+
/* Weak definitions of helper functions in case we don't have bpf syscall. */
const struct bpf_func_proto bpf_map_lookup_elem_proto __weak;
const struct bpf_func_proto bpf_map_update_elem_proto __weak;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 83c209d9b17a..19909b22b4f8 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -17,7 +17,7 @@
struct bpf_htab {
struct bpf_map map;
struct hlist_head *buckets;
- spinlock_t lock;
+ raw_spinlock_t lock;
u32 count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */
@@ -82,12 +82,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
for (i = 0; i < htab->n_buckets; i++)
INIT_HLIST_HEAD(&htab->buckets[i]);
- spin_lock_init(&htab->lock);
+ raw_spin_lock_init(&htab->lock);
htab->count = 0;
htab->elem_size = sizeof(struct htab_elem) +
round_up(htab->map.key_size, 8) +
htab->map.value_size;
+
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->elem_size * htab->map.max_entries,
+ PAGE_SIZE) >> PAGE_SHIFT;
return &htab->map;
free_htab:
@@ -230,7 +234,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
l_new->hash = htab_map_hash(l_new->key, key_size);
/* bpf_map_update_elem() can be called in_irq() */
- spin_lock_irqsave(&htab->lock, flags);
+ raw_spin_lock_irqsave(&htab->lock, flags);
head = select_bucket(htab, l_new->hash);
@@ -266,11 +270,11 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
} else {
htab->count++;
}
- spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
return 0;
err:
- spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
kfree(l_new);
return ret;
}
@@ -291,7 +295,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
hash = htab_map_hash(key, key_size);
- spin_lock_irqsave(&htab->lock, flags);
+ raw_spin_lock_irqsave(&htab->lock, flags);
head = select_bucket(htab, hash);
@@ -304,7 +308,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
ret = 0;
}
- spin_unlock_irqrestore(&htab->lock, flags);
+ raw_spin_unlock_irqrestore(&htab->lock, flags);
return ret;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 1447ec09421e..4504ca66118d 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -93,13 +93,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = {
.arg2_type = ARG_PTR_TO_MAP_KEY,
};
-static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
- return prandom_u32();
-}
-
const struct bpf_func_proto bpf_get_prandom_u32_proto = {
- .func = bpf_get_prandom_u32,
+ .func = bpf_user_rnd_u32,
.gpl_only = false,
.ret_type = RET_INTEGER,
};
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
new file mode 100644
index 000000000000..be6d726e31c9
--- /dev/null
+++ b/kernel/bpf/inode.c
@@ -0,0 +1,387 @@
+/*
+ * Minimal file system backend for holding eBPF maps and programs,
+ * used by bpf(2) object pinning.
+ *
+ * Authors:
+ *
+ * Daniel Borkmann <daniel@iogearbox.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/magic.h>
+#include <linux/major.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+
+enum bpf_type {
+ BPF_TYPE_UNSPEC = 0,
+ BPF_TYPE_PROG,
+ BPF_TYPE_MAP,
+};
+
+static void *bpf_any_get(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt);
+ break;
+ case BPF_TYPE_MAP:
+ atomic_inc(&((struct bpf_map *)raw)->refcnt);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ return raw;
+}
+
+static void bpf_any_put(void *raw, enum bpf_type type)
+{
+ switch (type) {
+ case BPF_TYPE_PROG:
+ bpf_prog_put(raw);
+ break;
+ case BPF_TYPE_MAP:
+ bpf_map_put(raw);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
+static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type)
+{
+ void *raw;
+
+ *type = BPF_TYPE_MAP;
+ raw = bpf_map_get(ufd);
+ if (IS_ERR(raw)) {
+ *type = BPF_TYPE_PROG;
+ raw = bpf_prog_get(ufd);
+ }
+
+ return raw;
+}
+
+static const struct inode_operations bpf_dir_iops;
+
+static const struct inode_operations bpf_prog_iops = { };
+static const struct inode_operations bpf_map_iops = { };
+
+static struct inode *bpf_get_inode(struct super_block *sb,
+ const struct inode *dir,
+ umode_t mode)
+{
+ struct inode *inode;
+
+ switch (mode & S_IFMT) {
+ case S_IFDIR:
+ case S_IFREG:
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
+
+ inode = new_inode(sb);
+ if (!inode)
+ return ERR_PTR(-ENOSPC);
+
+ inode->i_ino = get_next_ino();
+ inode->i_atime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime;
+ inode->i_ctime = inode->i_atime;
+
+ inode_init_owner(inode, dir, mode);
+
+ return inode;
+}
+
+static int bpf_inode_type(const struct inode *inode, enum bpf_type *type)
+{
+ *type = BPF_TYPE_UNSPEC;
+ if (inode->i_op == &bpf_prog_iops)
+ *type = BPF_TYPE_PROG;
+ else if (inode->i_op == &bpf_map_iops)
+ *type = BPF_TYPE_MAP;
+ else
+ return -EACCES;
+
+ return 0;
+}
+
+static bool bpf_dname_reserved(const struct dentry *dentry)
+{
+ return strchr(dentry->d_name.name, '.');
+}
+
+static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+ struct inode *inode;
+
+ if (bpf_dname_reserved(dentry))
+ return -EPERM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = &bpf_dir_iops;
+ inode->i_fop = &simple_dir_operations;
+
+ inc_nlink(inode);
+ inc_nlink(dir);
+
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ return 0;
+}
+
+static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const struct inode_operations *iops)
+{
+ struct inode *inode;
+
+ if (bpf_dname_reserved(dentry))
+ return -EPERM;
+
+ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ inode->i_op = iops;
+ inode->i_private = dentry->d_fsdata;
+
+ d_instantiate(dentry, inode);
+ dget(dentry);
+
+ return 0;
+}
+
+static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode,
+ dev_t devt)
+{
+ enum bpf_type type = MINOR(devt);
+
+ if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) ||
+ dentry->d_fsdata == NULL)
+ return -EPERM;
+
+ switch (type) {
+ case BPF_TYPE_PROG:
+ return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops);
+ case BPF_TYPE_MAP:
+ return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops);
+ default:
+ return -EPERM;
+ }
+}
+
+static const struct inode_operations bpf_dir_iops = {
+ .lookup = simple_lookup,
+ .mknod = bpf_mkobj,
+ .mkdir = bpf_mkdir,
+ .rmdir = simple_rmdir,
+ .unlink = simple_unlink,
+};
+
+static int bpf_obj_do_pin(const struct filename *pathname, void *raw,
+ enum bpf_type type)
+{
+ struct dentry *dentry;
+ struct inode *dir;
+ struct path path;
+ umode_t mode;
+ dev_t devt;
+ int ret;
+
+ dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask());
+ devt = MKDEV(UNNAMED_MAJOR, type);
+
+ ret = security_path_mknod(&path, dentry, mode, devt);
+ if (ret)
+ goto out;
+
+ dir = d_inode(path.dentry);
+ if (dir->i_op != &bpf_dir_iops) {
+ ret = -EPERM;
+ goto out;
+ }
+
+ dentry->d_fsdata = raw;
+ ret = vfs_mknod(dir, dentry, mode, devt);
+ dentry->d_fsdata = NULL;
+out:
+ done_path_create(&path, dentry);
+ return ret;
+}
+
+int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
+{
+ struct filename *pname;
+ enum bpf_type type;
+ void *raw;
+ int ret;
+
+ pname = getname(pathname);
+ if (IS_ERR(pname))
+ return PTR_ERR(pname);
+
+ raw = bpf_fd_probe_obj(ufd, &type);
+ if (IS_ERR(raw)) {
+ ret = PTR_ERR(raw);
+ goto out;
+ }
+
+ ret = bpf_obj_do_pin(pname, raw, type);
+ if (ret != 0)
+ bpf_any_put(raw, type);
+out:
+ putname(pname);
+ return ret;
+}
+
+static void *bpf_obj_do_get(const struct filename *pathname,
+ enum bpf_type *type)
+{
+ struct inode *inode;
+ struct path path;
+ void *raw;
+ int ret;
+
+ ret = kern_path(pathname->name, LOOKUP_FOLLOW, &path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ inode = d_backing_inode(path.dentry);
+ ret = inode_permission(inode, MAY_WRITE);
+ if (ret)
+ goto out;
+
+ ret = bpf_inode_type(inode, type);
+ if (ret)
+ goto out;
+
+ raw = bpf_any_get(inode->i_private, *type);
+ touch_atime(&path);
+
+ path_put(&path);
+ return raw;
+out:
+ path_put(&path);
+ return ERR_PTR(ret);
+}
+
+int bpf_obj_get_user(const char __user *pathname)
+{
+ enum bpf_type type = BPF_TYPE_UNSPEC;
+ struct filename *pname;
+ int ret = -ENOENT;
+ void *raw;
+
+ pname = getname(pathname);
+ if (IS_ERR(pname))
+ return PTR_ERR(pname);
+
+ raw = bpf_obj_do_get(pname, &type);
+ if (IS_ERR(raw)) {
+ ret = PTR_ERR(raw);
+ goto out;
+ }
+
+ if (type == BPF_TYPE_PROG)
+ ret = bpf_prog_new_fd(raw);
+ else if (type == BPF_TYPE_MAP)
+ ret = bpf_map_new_fd(raw);
+ else
+ goto out;
+
+ if (ret < 0)
+ bpf_any_put(raw, type);
+out:
+ putname(pname);
+ return ret;
+}
+
+static void bpf_evict_inode(struct inode *inode)
+{
+ enum bpf_type type;
+
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+}
+
+static const struct super_operations bpf_super_ops = {
+ .statfs = simple_statfs,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = bpf_evict_inode,
+};
+
+static int bpf_fill_super(struct super_block *sb, void *data, int silent)
+{
+ static struct tree_descr bpf_rfiles[] = { { "" } };
+ struct inode *inode;
+ int ret;
+
+ ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles);
+ if (ret)
+ return ret;
+
+ sb->s_op = &bpf_super_ops;
+
+ inode = sb->s_root->d_inode;
+ inode->i_op = &bpf_dir_iops;
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= S_ISVTX | S_IRWXUGO;
+
+ return 0;
+}
+
+static struct dentry *bpf_mount(struct file_system_type *type, int flags,
+ const char *dev_name, void *data)
+{
+ return mount_ns(type, flags, current->nsproxy->mnt_ns, bpf_fill_super);
+}
+
+static struct file_system_type bpf_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "bpf",
+ .mount = bpf_mount,
+ .kill_sb = kill_litter_super,
+ .fs_flags = FS_USERNS_MOUNT,
+};
+
+MODULE_ALIAS_FS("bpf");
+
+static int __init bpf_init(void)
+{
+ int ret;
+
+ ret = sysfs_create_mount_point(fs_kobj, "bpf");
+ if (ret)
+ return ret;
+
+ ret = register_filesystem(&bpf_fs_type);
+ if (ret)
+ sysfs_remove_mount_point(fs_kobj, "bpf");
+
+ return ret;
+}
+fs_initcall(bpf_init);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 35bac8e8b071..0d3313d02a7e 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -18,6 +18,8 @@
#include <linux/filter.h>
#include <linux/version.h>
+int sysctl_unprivileged_bpf_disabled __read_mostly;
+
static LIST_HEAD(bpf_map_types);
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
@@ -44,11 +46,38 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+static int bpf_map_charge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(map->pages, &user->locked_vm);
+
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ map->user = user;
+ return 0;
+}
+
+static void bpf_map_uncharge_memlock(struct bpf_map *map)
+{
+ struct user_struct *user = map->user;
+
+ atomic_long_sub(map->pages, &user->locked_vm);
+ free_uid(user);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
struct bpf_map *map = container_of(work, struct bpf_map, work);
+ bpf_map_uncharge_memlock(map);
/* implementation dependent freeing */
map->ops->map_free(map);
}
@@ -82,6 +111,12 @@ static const struct file_operations bpf_map_fops = {
.release = bpf_map_release,
};
+int bpf_map_new_fd(struct bpf_map *map)
+{
+ return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
+ O_RDWR | O_CLOEXEC);
+}
+
/* helper macro to check that unused fields 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
@@ -108,8 +143,11 @@ static int map_create(union bpf_attr *attr)
atomic_set(&map->refcnt, 1);
- err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+ err = bpf_map_charge_memlock(map);
+ if (err)
+ goto free_map;
+ err = bpf_map_new_fd(map);
if (err < 0)
/* failed to allocate fd */
goto free_map;
@@ -124,19 +162,29 @@ free_map:
/* if error is returned, fd is released.
* On success caller should complete fd access with matching fdput()
*/
-struct bpf_map *bpf_map_get(struct fd f)
+struct bpf_map *__bpf_map_get(struct fd f)
{
- struct bpf_map *map;
-
if (!f.file)
return ERR_PTR(-EBADF);
-
if (f.file->f_op != &bpf_map_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- map = f.file->private_data;
+ return f.file->private_data;
+}
+
+struct bpf_map *bpf_map_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map))
+ return map;
+
+ atomic_inc(&map->refcnt);
+ fdput(f);
return map;
}
@@ -164,7 +212,7 @@ static int map_lookup_elem(union bpf_attr *attr)
return -EINVAL;
f = fdget(ufd);
- map = bpf_map_get(f);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -223,7 +271,7 @@ static int map_update_elem(union bpf_attr *attr)
return -EINVAL;
f = fdget(ufd);
- map = bpf_map_get(f);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -276,7 +324,7 @@ static int map_delete_elem(union bpf_attr *attr)
return -EINVAL;
f = fdget(ufd);
- map = bpf_map_get(f);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -317,7 +365,7 @@ static int map_get_next_key(union bpf_attr *attr)
return -EINVAL;
f = fdget(ufd);
- map = bpf_map_get(f);
+ map = __bpf_map_get(f);
if (IS_ERR(map))
return PTR_ERR(map);
@@ -402,6 +450,10 @@ static void fixup_bpf_calls(struct bpf_prog *prog)
*/
BUG_ON(!prog->aux->ops->get_func_proto);
+ if (insn->imm == BPF_FUNC_get_route_realm)
+ prog->dst_needed = 1;
+ if (insn->imm == BPF_FUNC_get_prandom_u32)
+ bpf_user_rnd_init_once();
if (insn->imm == BPF_FUNC_tail_call) {
/* mark bpf_tail_call as different opcode
* to avoid conditional branch in
@@ -436,29 +488,51 @@ static void free_used_maps(struct bpf_prog_aux *aux)
kfree(aux->used_maps);
}
-static void __prog_put_rcu(struct rcu_head *rcu)
+static int bpf_prog_charge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = get_current_user();
+ unsigned long memlock_limit;
+
+ memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ atomic_long_add(prog->pages, &user->locked_vm);
+ if (atomic_long_read(&user->locked_vm) > memlock_limit) {
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+ return -EPERM;
+ }
+ prog->aux->user = user;
+ return 0;
+}
+
+static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
+{
+ struct user_struct *user = prog->aux->user;
+
+ atomic_long_sub(prog->pages, &user->locked_vm);
+ free_uid(user);
+}
+
+static void __prog_put_common(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
free_used_maps(aux);
+ bpf_prog_uncharge_memlock(aux->prog);
bpf_prog_free(aux->prog);
}
/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
- if (atomic_dec_and_test(&prog->aux->refcnt)) {
- prog->aux->prog = prog;
- call_rcu(&prog->aux->rcu, __prog_put_rcu);
- }
+ if (atomic_dec_and_test(&prog->aux->refcnt))
+ call_rcu(&prog->aux->rcu, __prog_put_common);
}
void bpf_prog_put(struct bpf_prog *prog)
{
- if (atomic_dec_and_test(&prog->aux->refcnt)) {
- free_used_maps(prog->aux);
- bpf_prog_free(prog);
- }
+ if (atomic_dec_and_test(&prog->aux->refcnt))
+ __prog_put_common(&prog->aux->rcu);
}
EXPORT_SYMBOL_GPL(bpf_prog_put);
@@ -474,21 +548,22 @@ static const struct file_operations bpf_prog_fops = {
.release = bpf_prog_release,
};
-static struct bpf_prog *get_prog(struct fd f)
+int bpf_prog_new_fd(struct bpf_prog *prog)
{
- struct bpf_prog *prog;
+ return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
+ O_RDWR | O_CLOEXEC);
+}
+static struct bpf_prog *__bpf_prog_get(struct fd f)
+{
if (!f.file)
return ERR_PTR(-EBADF);
-
if (f.file->f_op != &bpf_prog_fops) {
fdput(f);
return ERR_PTR(-EINVAL);
}
- prog = f.file->private_data;
-
- return prog;
+ return f.file->private_data;
}
/* called by sockets/tracing/seccomp before attaching program to an event
@@ -499,13 +574,13 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
struct fd f = fdget(ufd);
struct bpf_prog *prog;
- prog = get_prog(f);
-
+ prog = __bpf_prog_get(f);
if (IS_ERR(prog))
return prog;
atomic_inc(&prog->aux->refcnt);
fdput(f);
+
return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);
@@ -540,11 +615,18 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
/* plain bpf_prog allocation */
prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
if (!prog)
return -ENOMEM;
+ err = bpf_prog_charge_memlock(prog);
+ if (err)
+ goto free_prog_nouncharge;
+
prog->len = attr->insn_cnt;
err = -EFAULT;
@@ -553,10 +635,10 @@ static int bpf_prog_load(union bpf_attr *attr)
goto free_prog;
prog->orig_prog = NULL;
- prog->jited = false;
+ prog->jited = 0;
atomic_set(&prog->aux->refcnt, 1);
- prog->gpl_compatible = is_gpl;
+ prog->gpl_compatible = is_gpl ? 1 : 0;
/* find program type: socket_filter vs tracing_filter */
err = find_prog_type(type, prog);
@@ -576,7 +658,7 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+ err = bpf_prog_new_fd(prog);
if (err < 0)
/* failed to allocate fd */
goto free_used_maps;
@@ -586,20 +668,36 @@ static int bpf_prog_load(union bpf_attr *attr)
free_used_maps:
free_used_maps(prog->aux);
free_prog:
+ bpf_prog_uncharge_memlock(prog);
+free_prog_nouncharge:
bpf_prog_free(prog);
return err;
}
+#define BPF_OBJ_LAST_FIELD bpf_fd
+
+static int bpf_obj_pin(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_OBJ))
+ return -EINVAL;
+
+ return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname));
+}
+
+static int bpf_obj_get(const union bpf_attr *attr)
+{
+ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+ return -EINVAL;
+
+ return bpf_obj_get_user(u64_to_ptr(attr->pathname));
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
int err;
- /* the syscall is limited to root temporarily. This restriction will be
- * lifted when security audit is clean. Note that eBPF+tracing must have
- * this restriction, since it may pass kernel data to user space
- */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled)
return -EPERM;
if (!access_ok(VERIFY_READ, uattr, 1))
@@ -654,6 +752,12 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_LOAD:
err = bpf_prog_load(&attr);
break;
+ case BPF_OBJ_PIN:
+ err = bpf_obj_pin(&attr);
+ break;
+ case BPF_OBJ_GET:
+ err = bpf_obj_get(&attr);
+ break;
default:
err = -EINVAL;
break;
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index b074b23000d6..c6073056badf 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -199,6 +199,7 @@ struct verifier_env {
struct verifier_state_list **explored_states; /* search pruning optimization */
struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
u32 used_map_cnt; /* number of used maps */
+ bool allow_ptr_leaks;
};
/* verbose verifier prints what it's seeing
@@ -213,7 +214,7 @@ static DEFINE_MUTEX(bpf_verifier_lock);
* verbose() is used to dump the verification trace to the log, so the user
* can figure out what's wrong with the program
*/
-static void verbose(const char *fmt, ...)
+static __printf(1, 2) void verbose(const char *fmt, ...)
{
va_list args;
@@ -244,6 +245,7 @@ static const struct {
} func_limit[] = {
{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+ {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
};
static void print_verifier_state(struct verifier_env *env)
@@ -538,6 +540,21 @@ static int bpf_size_to_bytes(int bpf_size)
return -EINVAL;
}
+static bool is_spillable_regtype(enum bpf_reg_type type)
+{
+ switch (type) {
+ case PTR_TO_MAP_VALUE:
+ case PTR_TO_MAP_VALUE_OR_NULL:
+ case PTR_TO_STACK:
+ case PTR_TO_CTX:
+ case FRAME_PTR:
+ case CONST_PTR_TO_MAP:
+ return true;
+ default:
+ return false;
+ }
+}
+
/* check_stack_read/write functions track spill/fill of registers,
* stack boundary and alignment are checked in check_mem_access()
*/
@@ -550,9 +567,7 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
*/
if (value_regno >= 0 &&
- (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
- state->regs[value_regno].type == PTR_TO_STACK ||
- state->regs[value_regno].type == PTR_TO_CTX)) {
+ is_spillable_regtype(state->regs[value_regno].type)) {
/* register containing pointer is being spilled into stack */
if (size != BPF_REG_SIZE) {
@@ -643,6 +658,20 @@ static int check_ctx_access(struct verifier_env *env, int off, int size,
return -EACCES;
}
+static bool is_pointer_value(struct verifier_env *env, int regno)
+{
+ if (env->allow_ptr_leaks)
+ return false;
+
+ switch (env->cur_state.regs[regno].type) {
+ case UNKNOWN_VALUE:
+ case CONST_IMM:
+ return false;
+ default:
+ return true;
+ }
+}
+
/* check whether memory at (regno + off) is accessible for t = (read | write)
* if t==write, value_regno is a register which value is stored into memory
* if t==read, value_regno is a register which will receive the value from memory
@@ -669,11 +698,21 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
}
if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into map\n", value_regno);
+ return -EACCES;
+ }
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
} else if (state->regs[regno].type == PTR_TO_CTX) {
+ if (t == BPF_WRITE && value_regno >= 0 &&
+ is_pointer_value(env, value_regno)) {
+ verbose("R%d leaks addr into ctx\n", value_regno);
+ return -EACCES;
+ }
err = check_ctx_access(env, off, size, t);
if (!err && t == BPF_READ && value_regno >= 0)
mark_reg_unknown_value(state->regs, value_regno);
@@ -684,10 +723,17 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
- if (t == BPF_WRITE)
+ if (t == BPF_WRITE) {
+ if (!env->allow_ptr_leaks &&
+ state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose("attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
err = check_stack_write(state, off, size, value_regno);
- else
+ } else {
err = check_stack_read(state, off, size, value_regno);
+ }
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[state->regs[regno].type]);
@@ -775,8 +821,13 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
return -EACCES;
}
- if (arg_type == ARG_ANYTHING)
+ if (arg_type == ARG_ANYTHING) {
+ if (is_pointer_value(env, regno)) {
+ verbose("R%d leaks addr into helper function\n", regno);
+ return -EACCES;
+ }
return 0;
+ }
if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
arg_type == ARG_PTR_TO_MAP_VALUE) {
@@ -860,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
* don't allow any other map type to be passed into
* the special func;
*/
- if (bool_map != bool_func)
+ if (bool_func && bool_map != bool_func)
return -EINVAL;
}
@@ -950,8 +1001,9 @@ static int check_call(struct verifier_env *env, int func_id)
}
/* check validity of 32-bit and 64-bit arithmetic operations */
-static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn)
{
+ struct reg_state *regs = env->cur_state.regs;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -976,6 +1028,12 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
+
/* check dest operand */
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
if (err)
@@ -1012,6 +1070,11 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
*/
regs[insn->dst_reg] = regs[insn->src_reg];
} else {
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d partial copy of pointer\n",
+ insn->src_reg);
+ return -EACCES;
+ }
regs[insn->dst_reg].type = UNKNOWN_VALUE;
regs[insn->dst_reg].map_ptr = NULL;
}
@@ -1061,8 +1124,18 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
/* pattern match 'bpf_add Rx, imm' instruction */
if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
regs[insn->dst_reg].type == FRAME_PTR &&
- BPF_SRC(insn->code) == BPF_K)
+ BPF_SRC(insn->code) == BPF_K) {
stack_relative = true;
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->dst_reg);
+ return -EACCES;
+ } else if (BPF_SRC(insn->code) == BPF_X &&
+ is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer arithmetic prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
/* check dest operand */
err = check_reg_arg(regs, insn->dst_reg, DST_OP);
@@ -1101,6 +1174,12 @@ static int check_cond_jmp_op(struct verifier_env *env,
err = check_reg_arg(regs, insn->src_reg, SRC_OP);
if (err)
return err;
+
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d pointer comparison prohibited\n",
+ insn->src_reg);
+ return -EACCES;
+ }
} else {
if (insn->src_reg != BPF_REG_0) {
verbose("BPF_JMP uses reserved fields\n");
@@ -1155,6 +1234,9 @@ static int check_cond_jmp_op(struct verifier_env *env,
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = 0;
}
+ } else if (is_pointer_value(env, insn->dst_reg)) {
+ verbose("R%d pointer comparison prohibited\n", insn->dst_reg);
+ return -EACCES;
} else if (BPF_SRC(insn->code) == BPF_K &&
(opcode == BPF_JEQ || opcode == BPF_JNE)) {
@@ -1658,7 +1740,7 @@ static int do_check(struct verifier_env *env)
}
if (class == BPF_ALU || class == BPF_ALU64) {
- err = check_alu_op(regs, insn);
+ err = check_alu_op(env, insn);
if (err)
return err;
@@ -1816,6 +1898,11 @@ static int do_check(struct verifier_env *env)
if (err)
return err;
+ if (is_pointer_value(env, BPF_REG_0)) {
+ verbose("R0 leaks addr as return value\n");
+ return -EACCES;
+ }
+
process_bpf_exit:
insn_idx = pop_stack(env, &prev_insn_idx);
if (insn_idx < 0) {
@@ -1902,8 +1989,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
}
f = fdget(insn->imm);
-
- map = bpf_map_get(f);
+ map = __bpf_map_get(f);
if (IS_ERR(map)) {
verbose("fd %d is not pointing to valid bpf_map\n",
insn->imm);
@@ -2024,7 +2110,7 @@ static int convert_ctx_accesses(struct verifier_env *env)
cnt = env->prog->aux->ops->
convert_ctx_access(type, insn->dst_reg, insn->src_reg,
- insn->off, insn_buf);
+ insn->off, insn_buf, env->prog);
if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
@@ -2144,6 +2230,8 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
if (ret < 0)
goto skip_full_check;
+ env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
ret = do_check(env);
skip_full_check:
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2c9eae6ad970..f1603c153890 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,7 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
-#include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
#include <linux/string.h>
#include <linux/sort.h>
#include <linux/kmod.h>
@@ -75,7 +75,7 @@
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
- * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * css_set_lock protects task->cgroups pointer, the list of css_set
* objects, and the chain of tasks off each css_set.
*
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -83,12 +83,12 @@
*/
#ifdef CONFIG_PROVE_RCU
DEFINE_MUTEX(cgroup_mutex);
-DECLARE_RWSEM(css_set_rwsem);
+DEFINE_SPINLOCK(css_set_lock);
EXPORT_SYMBOL_GPL(cgroup_mutex);
-EXPORT_SYMBOL_GPL(css_set_rwsem);
+EXPORT_SYMBOL_GPL(css_set_lock);
#else
static DEFINE_MUTEX(cgroup_mutex);
-static DECLARE_RWSEM(css_set_rwsem);
+static DEFINE_SPINLOCK(css_set_lock);
#endif
/*
@@ -103,6 +103,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
*/
static DEFINE_SPINLOCK(release_agent_path_lock);
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
#define cgroup_assert_mutex_or_rcu_locked() \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
!lockdep_is_held(&cgroup_mutex), \
@@ -136,6 +138,27 @@ static const char *cgroup_subsys_name[] = {
};
#undef SUBSYS
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x) \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key); \
+ DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key); \
+ EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
/*
* The default hierarchy, reserved for the subsystems that are otherwise
* unattached - it never has more than a single cgroup, and all tasks are
@@ -150,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
*/
static bool cgrp_dfl_root_visible;
-/*
- * Set by the boot param of the same name and makes subsystems with NULL
- * ->dfl_files to use ->legacy_files on the default hierarchy.
- */
-static bool cgroup_legacy_files_on_dfl;
-
/* some controllers are not supported in the default hierarchy */
static unsigned long cgrp_dfl_root_inhibit_ss_mask;
@@ -183,6 +200,7 @@ static u64 css_serial_nr_next = 1;
*/
static unsigned long have_fork_callback __read_mostly;
static unsigned long have_exit_callback __read_mostly;
+static unsigned long have_free_callback __read_mostly;
/* Ditto for the can_fork callback. */
static unsigned long have_canfork_callback __read_mostly;
@@ -192,14 +210,87 @@ static struct cftype cgroup_legacy_base_files[];
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
static int cgroup_destroy_locked(struct cgroup *cgrp);
static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names which
+ * is fine for individual subsystems but unsuitable for cgroup core. This
+ * is slower static_key_enabled() based test indexed by @ssid.
+ */
+static bool cgroup_ssid_enabled(int ssid)
+{
+ return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
+}
+
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differnetly depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ * and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed. Everything should be at process granularity. Use
+ * "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted. pids will be unique unless they got
+ * recycled inbetween reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed. Replacement
+ * notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
+ * and its descendants contain no task; otherwise, 1. The file also
+ * generates kernfs notification which can be monitored through poll and
+ * [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ * take masks of ancestors with non-empty cpus/mems, instead of being
+ * moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ * masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ * is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+ return cgrp->root == &cgrp_dfl_root;
+}
+
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
gfp_t gfp_mask)
@@ -208,7 +299,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
idr_preload(gfp_mask);
spin_lock_bh(&cgroup_idr_lock);
- ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
+ ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM);
spin_unlock_bh(&cgroup_idr_lock);
idr_preload_end();
return ret;
@@ -332,6 +423,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
return !(cgrp->self.flags & CSS_ONLINE);
}
+static void cgroup_get(struct cgroup *cgrp)
+{
+ WARN_ON_ONCE(cgroup_is_dead(cgrp));
+ css_get(&cgrp->self);
+}
+
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+ return css_tryget(&cgrp->self);
+}
+
+static void cgroup_put(struct cgroup *cgrp)
+{
+ css_put(&cgrp->self);
+}
+
struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
{
struct cgroup *cgrp = of->kn->parent->priv;
@@ -481,19 +588,31 @@ struct css_set init_css_set = {
.mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks),
.mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
+ .task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
};
static int css_set_count = 1; /* 1 for init_css_set */
/**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+}
+
+/**
* cgroup_update_populated - updated populated count of a cgroup
* @cgrp: the target cgroup
* @populated: inc or dec populated count
*
- * @cgrp is either getting the first task (css_set) or losing the last.
- * Update @cgrp->populated_cnt accordingly. The count is propagated
- * towards root so that a given cgroup's populated_cnt is zero iff the
- * cgroup and all its descendants are empty.
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last. Update @cgrp->populated_cnt accordingly. The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
*
* @cgrp's interface file "cgroup.populated" is zero if
* @cgrp->populated_cnt is zero and 1 otherwise. When @cgrp->populated_cnt
@@ -503,7 +622,7 @@ static int css_set_count = 1; /* 1 for init_css_set */
*/
static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
{
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
do {
bool trigger;
@@ -516,12 +635,93 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
if (!trigger)
break;
- if (cgrp->populated_kn)
- kernfs_notify(cgrp->populated_kn);
+ check_for_release(cgrp);
+ cgroup_file_notify(&cgrp->events_file);
+
cgrp = cgroup_parent(cgrp);
} while (cgrp);
}
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * @cset is either getting the first task or losing the last. Update the
+ * ->populated_cnt of all associated cgroups accordingly.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+ struct cgrp_cset_link *link;
+
+ lockdep_assert_held(&css_set_lock);
+
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+ cgroup_update_populated(link->cgrp, populated);
+}
+
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset. If @task didn't belong to any
+ * css_set, @from_cset can be NULL. If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
+ */
+static void css_set_move_task(struct task_struct *task,
+ struct css_set *from_cset, struct css_set *to_cset,
+ bool use_mg_tasks)
+{
+ lockdep_assert_held(&css_set_lock);
+
+ if (from_cset) {
+ struct css_task_iter *it, *pos;
+
+ WARN_ON_ONCE(list_empty(&task->cg_list));
+
+ /*
+ * @task is leaving, advance task iterators which are
+ * pointing to it so that they can resume at the next
+ * position. Advancing an iterator might remove it from
+ * the list, use safe walk. See css_task_iter_advance*()
+ * for details.
+ */
+ list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+ iters_node)
+ if (it->task_pos == &task->cg_list)
+ css_task_iter_advance(it);
+
+ list_del_init(&task->cg_list);
+ if (!css_set_populated(from_cset))
+ css_set_update_populated(from_cset, false);
+ } else {
+ WARN_ON_ONCE(!list_empty(&task->cg_list));
+ }
+
+ if (to_cset) {
+ /*
+ * We are synchronized through cgroup_threadgroup_rwsem
+ * against PF_EXITING setting such that we can't race
+ * against cgroup_exit() changing the css_set to
+ * init_css_set and dropping the old one.
+ */
+ WARN_ON_ONCE(task->flags & PF_EXITING);
+
+ if (!css_set_populated(to_cset))
+ css_set_update_populated(to_cset, true);
+ rcu_assign_pointer(task->cgroups, to_cset);
+ list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+ &to_cset->tasks);
+ }
+}
+
/*
* hash table for cgroup groups. This improves the performance to find
* an existing css_set. This hash doesn't (currently) take into
@@ -549,7 +749,7 @@ static void put_css_set_locked(struct css_set *cset)
struct cgroup_subsys *ss;
int ssid;
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
if (!atomic_dec_and_test(&cset->refcount))
return;
@@ -561,17 +761,10 @@ static void put_css_set_locked(struct css_set *cset)
css_set_count--;
list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *cgrp = link->cgrp;
-
list_del(&link->cset_link);
list_del(&link->cgrp_link);
-
- /* @cgrp can't go away while we're holding css_set_rwsem */
- if (list_empty(&cgrp->cset_links)) {
- cgroup_update_populated(cgrp, false);
- check_for_release(cgrp);
- }
-
+ if (cgroup_parent(link->cgrp))
+ cgroup_put(link->cgrp);
kfree(link);
}
@@ -588,9 +781,9 @@ static void put_css_set(struct css_set *cset)
if (atomic_add_unless(&cset->refcount, -1, 1))
return;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
put_css_set_locked(cset);
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
/*
@@ -779,15 +972,15 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
link->cset = cset;
link->cgrp = cgrp;
- if (list_empty(&cgrp->cset_links))
- cgroup_update_populated(cgrp, true);
- list_move(&link->cset_link, &cgrp->cset_links);
-
/*
- * Always add links to the tail of the list so that the list
- * is sorted by order of hierarchy creation
+ * Always add links to the tail of the lists so that the lists are
+ * in choronological order.
*/
+ list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+ if (cgroup_parent(cgrp))
+ cgroup_get(cgrp);
}
/**
@@ -813,11 +1006,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
/* First see if we already have a cgroup group that matches
* the desired set */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
if (cset)
return cset;
@@ -838,13 +1031,14 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->mg_tasks);
INIT_LIST_HEAD(&cset->mg_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
+ INIT_LIST_HEAD(&cset->task_iters);
INIT_HLIST_NODE(&cset->hlist);
/* Copy the set of subsystem state objects generated in
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -866,53 +1060,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
list_add_tail(&cset->e_cset_node[ssid],
&cset->subsys[ssid]->cgroup->e_csets[ssid]);
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
return cset;
}
-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
- down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
- up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to. No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid. This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization. While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
- down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
- up_write(&tsk->signal->group_rwsem);
-}
-
static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -972,14 +1124,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
list_del(&link->cgrp_link);
kfree(link);
}
- up_write(&css_set_rwsem);
+
+ spin_unlock_bh(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
@@ -1001,7 +1154,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
struct cgroup *res = NULL;
lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
if (cset == &init_css_set) {
res = &root->cgrp;
@@ -1024,7 +1177,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
/*
* Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_rwsem held.
+ * called with cgroup_mutex and css_set_lock held.
*/
static struct cgroup *task_cgroup_from_root(struct task_struct *task,
struct cgroup_root *root)
@@ -1063,7 +1216,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* update of a tasks cgroup pointer by cgroup_attach_task()
*/
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
static const struct file_operations proc_cgroupstats_operations;
@@ -1086,43 +1238,25 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
* cgroup_file_mode - deduce file mode of a control file
* @cft: the control file in question
*
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * S_IRUGO for read, S_IWUSR for write.
*/
static umode_t cgroup_file_mode(const struct cftype *cft)
{
umode_t mode = 0;
- if (cft->mode)
- return cft->mode;
-
if (cft->read_u64 || cft->read_s64 || cft->seq_show)
mode |= S_IRUGO;
- if (cft->write_u64 || cft->write_s64 || cft->write)
- mode |= S_IWUSR;
+ if (cft->write_u64 || cft->write_s64 || cft->write) {
+ if (cft->flags & CFTYPE_WORLD_WRITABLE)
+ mode |= S_IWUGO;
+ else
+ mode |= S_IWUSR;
+ }
return mode;
}
-static void cgroup_get(struct cgroup *cgrp)
-{
- WARN_ON_ONCE(cgroup_is_dead(cgrp));
- css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
- return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
- css_put(&cgrp->self);
-}
-
/**
* cgroup_calc_child_subsys_mask - calculate child_subsys_mask
* @cgrp: the target cgroup
@@ -1263,28 +1397,64 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
}
/**
- * cgroup_clear_dir - remove subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be removed
+ * css_clear_dir - remove subsys files in a cgroup directory
+ * @css: taget css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
*/
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void css_clear_dir(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp_override)
{
- struct cgroup_subsys *ss;
- int i;
+ struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cftype *cfts;
- for_each_subsys(ss, i) {
- struct cftype *cfts;
+ list_for_each_entry(cfts, &css->ss->cfts, node)
+ cgroup_addrm_files(css, cgrp, cfts, false);
+}
- if (!(subsys_mask & (1 << i)))
- continue;
- list_for_each_entry(cfts, &ss->cfts, node)
- cgroup_addrm_files(cgrp, cfts, false);
+/**
+ * css_populate_dir - create subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_overried: specify if target cgroup is different from css->cgroup
+ *
+ * On failure, no file is added.
+ */
+static int css_populate_dir(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp_override)
+{
+ struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+ struct cftype *cfts, *failed_cfts;
+ int ret;
+
+ if (!css->ss) {
+ if (cgroup_on_dfl(cgrp))
+ cfts = cgroup_dfl_base_files;
+ else
+ cfts = cgroup_legacy_base_files;
+
+ return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+ }
+
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ ret = cgroup_addrm_files(css, cgrp, cfts, true);
+ if (ret < 0) {
+ failed_cfts = cfts;
+ goto err;
+ }
}
+ return 0;
+err:
+ list_for_each_entry(cfts, &css->ss->cfts, node) {
+ if (cfts == failed_cfts)
+ break;
+ cgroup_addrm_files(css, cgrp, cfts, false);
+ }
+ return ret;
}
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned long ss_mask)
{
+ struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
unsigned long tmp_ss_mask;
int ssid, i, ret;
@@ -1306,10 +1476,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
if (dst_root == &cgrp_dfl_root)
tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
- ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
- if (ret) {
- if (dst_root != &cgrp_dfl_root)
- return ret;
+ for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+ struct cgroup *scgrp = &ss->root->cgrp;
+ int tssid;
+
+ ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
+ if (!ret)
+ continue;
/*
* Rebinding back to the default root is not allowed to
@@ -1317,57 +1490,67 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
* be rare. Moving subsystems back and forth even more so.
* Just warn about it and continue.
*/
- if (cgrp_dfl_root_visible) {
- pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
- ret, ss_mask);
- pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+ if (dst_root == &cgrp_dfl_root) {
+ if (cgrp_dfl_root_visible) {
+ pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
+ ret, ss_mask);
+ pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+ }
+ continue;
+ }
+
+ for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
+ if (tssid == ssid)
+ break;
+ css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
}
+ return ret;
}
/*
* Nothing can fail from this point on. Remove files for the
* removed subsystems and rebind each subsystem.
*/
- for_each_subsys_which(ss, ssid, &ss_mask)
- cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
-
for_each_subsys_which(ss, ssid, &ss_mask) {
- struct cgroup_root *src_root;
- struct cgroup_subsys_state *css;
+ struct cgroup_root *src_root = ss->root;
+ struct cgroup *scgrp = &src_root->cgrp;
+ struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
struct css_set *cset;
- src_root = ss->root;
- css = cgroup_css(&src_root->cgrp, ss);
+ WARN_ON(!css || cgroup_css(dcgrp, ss));
- WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
+ css_clear_dir(css, NULL);
- RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
- rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
+ rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
- css->cgroup = &dst_root->cgrp;
+ css->cgroup = dcgrp;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
- &dst_root->cgrp.e_csets[ss->id]);
- up_write(&css_set_rwsem);
+ &dcgrp->e_csets[ss->id]);
+ spin_unlock_bh(&css_set_lock);
src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.subtree_control &= ~(1 << ssid);
- cgroup_refresh_child_subsys_mask(&src_root->cgrp);
+ scgrp->subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(scgrp);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root) {
- dst_root->cgrp.subtree_control |= 1 << ssid;
- cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ if (dst_root == &cgrp_dfl_root) {
+ static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
+ } else {
+ dcgrp->subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(dcgrp);
+ static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
}
if (ss->bind)
ss->bind(css);
}
- kernfs_activate(dst_root->cgrp.kn);
+ kernfs_activate(dcgrp->kn);
return 0;
}
@@ -1497,7 +1680,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
for_each_subsys(ss, i) {
if (strcmp(token, ss->legacy_name))
continue;
- if (ss->disabled)
+ if (!cgroup_ssid_enabled(i))
continue;
/* Mutually exclusive option 'all' + subsystem name */
@@ -1528,7 +1711,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
*/
if (all_ss || (!one_ss && !opts->none && !opts->name))
for_each_subsys(ss, i)
- if (!ss->disabled)
+ if (cgroup_ssid_enabled(i))
opts->subsys_mask |= (1 << i);
/*
@@ -1624,7 +1807,7 @@ static void cgroup_enable_task_cg_lists(void)
{
struct task_struct *p, *g;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
if (use_task_css_set_links)
goto out_unlock;
@@ -1654,14 +1837,16 @@ static void cgroup_enable_task_cg_lists(void)
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
- list_add(&p->cg_list, &cset->tasks);
+ if (!css_set_populated(cset))
+ css_set_update_populated(cset, true);
+ list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
}
spin_unlock_irq(&p->sighand->siglock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
out_unlock:
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1671,6 +1856,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
+ INIT_LIST_HEAD(&cgrp->self.files);
INIT_LIST_HEAD(&cgrp->cset_links);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
@@ -1708,7 +1894,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
- struct cftype *base_files;
struct css_set *cset;
int i, ret;
@@ -1725,7 +1910,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
goto out;
/*
- * We're accessing css_set_count without locking css_set_rwsem here,
+ * We're accessing css_set_count without locking css_set_lock here,
* but that's OK - it can only be increased by someone holding
* cgroup_lock, and that's us. The worst that can happen is that we
* have some link structures left over
@@ -1747,12 +1932,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- if (root == &cgrp_dfl_root)
- base_files = cgroup_dfl_base_files;
- else
- base_files = cgroup_legacy_base_files;
-
- ret = cgroup_addrm_files(root_cgrp, base_files, true);
+ ret = css_populate_dir(&root_cgrp->self, NULL);
if (ret)
goto destroy_root;
@@ -1772,10 +1952,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
- down_write(&css_set_rwsem);
- hash_for_each(css_set_table, i, cset, hlist)
+ spin_lock_bh(&css_set_lock);
+ hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
- up_write(&css_set_rwsem);
+ if (css_set_populated(cset))
+ cgroup_update_populated(root_cgrp, true);
+ }
+ spin_unlock_bh(&css_set_lock);
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2008,7 +2191,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
char *path = NULL;
mutex_lock(&cgroup_mutex);
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
@@ -2021,7 +2204,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
path = buf;
}
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return path;
}
@@ -2049,6 +2232,49 @@ struct cgroup_taskset {
struct task_struct *cur_task;
};
+#define CGROUP_TASKSET_INIT(tset) (struct cgroup_taskset){ \
+ .src_csets = LIST_HEAD_INIT(tset.src_csets), \
+ .dst_csets = LIST_HEAD_INIT(tset.dst_csets), \
+ .csets = &tset.src_csets, \
+}
+
+/**
+ * cgroup_taskset_add - try to add a migration target task to a taskset
+ * @task: target task
+ * @tset: target taskset
+ *
+ * Add @task, which is a migration target, to @tset. This function becomes
+ * noop if @task doesn't need to be migrated. @task's css_set should have
+ * been added as a migration source and @task->cg_list will be moved from
+ * the css_set's tasks list to mg_tasks one.
+ */
+static void cgroup_taskset_add(struct task_struct *task,
+ struct cgroup_taskset *tset)
+{
+ struct css_set *cset;
+
+ lockdep_assert_held(&css_set_lock);
+
+ /* @task either already exited or can't exit until the end */
+ if (task->flags & PF_EXITING)
+ return;
+
+ /* leave @task alone if post_fork() hasn't linked it yet */
+ if (list_empty(&task->cg_list))
+ return;
+
+ cset = task_css_set(task);
+ if (!cset->mg_src_cgrp)
+ return;
+
+ list_move_tail(&task->cg_list, &cset->mg_tasks);
+ if (list_empty(&cset->mg_node))
+ list_add_tail(&cset->mg_node, &tset->src_csets);
+ if (list_empty(&cset->mg_dst_cset->mg_node))
+ list_move_tail(&cset->mg_dst_cset->mg_node,
+ &tset->dst_csets);
+}
+
/**
* cgroup_taskset_first - reset taskset and return the first task
* @tset: taskset of interest
@@ -2096,47 +2322,86 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
}
/**
- * cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp: the cgroup @tsk is being migrated from
- * @tsk: the task being migrated
- * @new_cset: the new css_set @tsk is being attached to
+ * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * @tset: taget taskset
+ * @dst_cgrp: destination cgroup
*
- * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
+ * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the
+ * ->can_attach callbacks fails and guarantees that either all or none of
+ * the tasks in @tset are migrated. @tset is consumed regardless of
+ * success.
*/
-static void cgroup_task_migrate(struct cgroup *old_cgrp,
- struct task_struct *tsk,
- struct css_set *new_cset)
+static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
+ struct cgroup *dst_cgrp)
{
- struct css_set *old_cset;
-
- lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
+ struct cgroup_subsys_state *css, *failed_css = NULL;
+ struct task_struct *task, *tmp_task;
+ struct css_set *cset, *tmp_cset;
+ int i, ret;
- /*
- * We are synchronized through threadgroup_lock() against PF_EXITING
- * setting such that we can't race against cgroup_exit() changing the
- * css_set to init_css_set and dropping the old one.
- */
- WARN_ON_ONCE(tsk->flags & PF_EXITING);
- old_cset = task_css_set(tsk);
+ /* methods shouldn't be called if no task is actually migrating */
+ if (list_empty(&tset->src_csets))
+ return 0;
- get_css_set(new_cset);
- rcu_assign_pointer(tsk->cgroups, new_cset);
+ /* check that we can legitimately attach to the cgroup */
+ for_each_e_css(css, i, dst_cgrp) {
+ if (css->ss->can_attach) {
+ ret = css->ss->can_attach(css, tset);
+ if (ret) {
+ failed_css = css;
+ goto out_cancel_attach;
+ }
+ }
+ }
/*
- * Use move_tail so that cgroup_taskset_first() still returns the
- * leader after migration. This works because cgroup_migrate()
- * ensures that the dst_cset of the leader is the first on the
- * tset's dst_csets list.
+ * Now that we're guaranteed success, proceed to move all tasks to
+ * the new cgroup. There are no failure cases after here, so this
+ * is the commit point.
*/
- list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
+ spin_lock_bh(&css_set_lock);
+ list_for_each_entry(cset, &tset->src_csets, mg_node) {
+ list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
+ struct css_set *from_cset = task_css_set(task);
+ struct css_set *to_cset = cset->mg_dst_cset;
+
+ get_css_set(to_cset);
+ css_set_move_task(task, from_cset, to_cset, true);
+ put_css_set_locked(from_cset);
+ }
+ }
+ spin_unlock_bh(&css_set_lock);
/*
- * We just gained a reference on old_cset by taking it from the
- * task. As trading it for new_cset is protected by cgroup_mutex,
- * we're safe to drop it here; it will be freed under RCU.
+ * Migration is committed, all target tasks are now on dst_csets.
+ * Nothing is sensitive to fork() after this point. Notify
+ * controllers that migration is complete.
*/
- put_css_set_locked(old_cset);
+ tset->csets = &tset->dst_csets;
+
+ for_each_e_css(css, i, dst_cgrp)
+ if (css->ss->attach)
+ css->ss->attach(css, tset);
+
+ ret = 0;
+ goto out_release_tset;
+
+out_cancel_attach:
+ for_each_e_css(css, i, dst_cgrp) {
+ if (css == failed_css)
+ break;
+ if (css->ss->cancel_attach)
+ css->ss->cancel_attach(css, tset);
+ }
+out_release_tset:
+ spin_lock_bh(&css_set_lock);
+ list_splice_init(&tset->dst_csets, &tset->src_csets);
+ list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
+ list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+ list_del_init(&cset->mg_node);
+ }
+ spin_unlock_bh(&css_set_lock);
+ return ret;
}
/**
@@ -2152,14 +2417,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
lockdep_assert_held(&cgroup_mutex);
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
/**
@@ -2172,10 +2437,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
* @src_cset and add it to @preloaded_csets, which should later be cleaned
* up by cgroup_migrate_finish().
*
- * This function may be called without holding threadgroup_lock even if the
- * target is a process. Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process. Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
*/
static void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *dst_cgrp,
@@ -2184,7 +2450,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
struct cgroup *src_cgrp;
lockdep_assert_held(&cgroup_mutex);
- lockdep_assert_held(&css_set_rwsem);
+ lockdep_assert_held(&css_set_lock);
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
@@ -2273,12 +2539,12 @@ err:
/**
* cgroup_migrate - migrate a process or task to a cgroup
- * @cgrp: the destination cgroup
* @leader: the leader of the process or the task to migrate
* @threadgroup: whether @leader points to the whole process or a single task
+ * @cgrp: the destination cgroup
*
* Migrate a process or task denoted by @leader to @cgrp. If migrating a
- * process, the caller must be holding threadgroup_lock of @leader. The
+ * process, the caller must be holding cgroup_threadgroup_rwsem. The
* caller is also responsible for invoking cgroup_migrate_add_src() and
* cgroup_migrate_prepare_dst() on the targets before invoking this
* function and following up with cgroup_migrate_finish().
@@ -2289,115 +2555,29 @@ err:
* decided for all targets by invoking group_migrate_prepare_dst() before
* actually starting migrating.
*/
-static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
- bool threadgroup)
-{
- struct cgroup_taskset tset = {
- .src_csets = LIST_HEAD_INIT(tset.src_csets),
- .dst_csets = LIST_HEAD_INIT(tset.dst_csets),
- .csets = &tset.src_csets,
- };
- struct cgroup_subsys_state *css, *failed_css = NULL;
- struct css_set *cset, *tmp_cset;
- struct task_struct *task, *tmp_task;
- int i, ret;
+static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+ struct cgroup *cgrp)
+{
+ struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+ struct task_struct *task;
/*
* Prevent freeing of tasks while we take a snapshot. Tasks that are
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
rcu_read_lock();
task = leader;
do {
- /* @task either already exited or can't exit until the end */
- if (task->flags & PF_EXITING)
- goto next;
-
- /* leave @task alone if post_fork() hasn't linked it yet */
- if (list_empty(&task->cg_list))
- goto next;
-
- cset = task_css_set(task);
- if (!cset->mg_src_cgrp)
- goto next;
-
- /*
- * cgroup_taskset_first() must always return the leader.
- * Take care to avoid disturbing the ordering.
- */
- list_move_tail(&task->cg_list, &cset->mg_tasks);
- if (list_empty(&cset->mg_node))
- list_add_tail(&cset->mg_node, &tset.src_csets);
- if (list_empty(&cset->mg_dst_cset->mg_node))
- list_move_tail(&cset->mg_dst_cset->mg_node,
- &tset.dst_csets);
- next:
+ cgroup_taskset_add(task, &tset);
if (!threadgroup)
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- up_write(&css_set_rwsem);
-
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset.src_csets))
- return 0;
-
- /* check that we can legitimately attach to the cgroup */
- for_each_e_css(css, i, cgrp) {
- if (css->ss->can_attach) {
- ret = css->ss->can_attach(css, &tset);
- if (ret) {
- failed_css = css;
- goto out_cancel_attach;
- }
- }
- }
-
- /*
- * Now that we're guaranteed success, proceed to move all tasks to
- * the new cgroup. There are no failure cases after here, so this
- * is the commit point.
- */
- down_write(&css_set_rwsem);
- list_for_each_entry(cset, &tset.src_csets, mg_node) {
- list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
- cgroup_task_migrate(cset->mg_src_cgrp, task,
- cset->mg_dst_cset);
- }
- up_write(&css_set_rwsem);
-
- /*
- * Migration is committed, all target tasks are now on dst_csets.
- * Nothing is sensitive to fork() after this point. Notify
- * controllers that migration is complete.
- */
- tset.csets = &tset.dst_csets;
-
- for_each_e_css(css, i, cgrp)
- if (css->ss->attach)
- css->ss->attach(css, &tset);
-
- ret = 0;
- goto out_release_tset;
+ spin_unlock_bh(&css_set_lock);
-out_cancel_attach:
- for_each_e_css(css, i, cgrp) {
- if (css == failed_css)
- break;
- if (css->ss->cancel_attach)
- css->ss->cancel_attach(css, &tset);
- }
-out_release_tset:
- down_write(&css_set_rwsem);
- list_splice_init(&tset.dst_csets, &tset.src_csets);
- list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
- list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
- list_del_init(&cset->mg_node);
- }
- up_write(&css_set_rwsem);
- return ret;
+ return cgroup_taskset_migrate(&tset, cgrp);
}
/**
@@ -2406,7 +2586,7 @@ out_release_tset:
* @leader: the task or the leader of the threadgroup to be attached
* @threadgroup: attach the whole threadgroup?
*
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
*/
static int cgroup_attach_task(struct cgroup *dst_cgrp,
struct task_struct *leader, bool threadgroup)
@@ -2416,7 +2596,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
int ret;
/* look up all src csets */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
rcu_read_lock();
task = leader;
do {
@@ -2426,12 +2606,12 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
if (!ret)
- ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+ ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
cgroup_migrate_finish(&preloaded_csets);
return ret;
@@ -2459,15 +2639,15 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *cgrp;
struct inode *inode;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
while (!cgroup_is_descendant(dst_cgrp, cgrp))
cgrp = cgroup_parent(cgrp);
ret = -ENOMEM;
- inode = kernfs_get_inode(sb, cgrp->procs_kn);
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
if (inode) {
ret = inode_permission(inode, MAY_WRITE);
iput(inode);
@@ -2498,14 +2678,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
if (!cgrp)
return -ENODEV;
-retry_find_task:
+ percpu_down_write(&cgroup_threadgroup_rwsem);
rcu_read_lock();
if (pid) {
tsk = find_task_by_vpid(pid);
if (!tsk) {
- rcu_read_unlock();
ret = -ESRCH;
- goto out_unlock_cgroup;
+ goto out_unlock_rcu;
}
} else {
tsk = current;
@@ -2521,37 +2700,23 @@ retry_find_task:
*/
if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
- rcu_read_unlock();
- goto out_unlock_cgroup;
+ goto out_unlock_rcu;
}
get_task_struct(tsk);
rcu_read_unlock();
- threadgroup_lock(tsk);
- if (threadgroup) {
- if (!thread_group_leader(tsk)) {
- /*
- * a race with de_thread from another thread's exec()
- * may strip us of our leadership, if this happens,
- * there is no choice but to throw this task away and
- * try again; this is
- * "double-double-toil-and-trouble-check locking".
- */
- threadgroup_unlock(tsk);
- put_task_struct(tsk);
- goto retry_find_task;
- }
- }
-
ret = cgroup_procs_write_permission(tsk, cgrp, of);
if (!ret)
ret = cgroup_attach_task(cgrp, tsk, threadgroup);
- threadgroup_unlock(tsk);
-
put_task_struct(tsk);
-out_unlock_cgroup:
+ goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+ rcu_read_unlock();
+out_unlock_threadgroup:
+ percpu_up_write(&cgroup_threadgroup_rwsem);
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
}
@@ -2573,9 +2738,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (root == &cgrp_dfl_root)
continue;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
@@ -2690,14 +2855,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
static int cgroup_update_dfl_csses(struct cgroup *cgrp)
{
LIST_HEAD(preloaded_csets);
+ struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
struct cgroup_subsys_state *css;
struct css_set *src_cset;
int ret;
lockdep_assert_held(&cgroup_mutex);
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+
/* look up all csses currently attached to @cgrp's subtree */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
struct cgrp_cset_link *link;
@@ -2709,68 +2877,31 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
cgroup_migrate_add_src(link->cset, cgrp,
&preloaded_csets);
}
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
if (ret)
goto out_finish;
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
- struct task_struct *last_task = NULL, *task;
+ struct task_struct *task, *ntask;
/* src_csets precede dst_csets, break on the first dst_cset */
if (!src_cset->mg_src_cgrp)
break;
- /*
- * All tasks in src_cset need to be migrated to the
- * matching dst_cset. Empty it process by process. We
- * walk tasks but migrate processes. The leader might even
- * belong to a different cset but such src_cset would also
- * be among the target src_csets because the default
- * hierarchy enforces per-process membership.
- */
- while (true) {
- down_read(&css_set_rwsem);
- task = list_first_entry_or_null(&src_cset->tasks,
- struct task_struct, cg_list);
- if (task) {
- task = task->group_leader;
- WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
- get_task_struct(task);
- }
- up_read(&css_set_rwsem);
-
- if (!task)
- break;
-
- /* guard against possible infinite loop */
- if (WARN(last_task == task,
- "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
- goto out_finish;
- last_task = task;
-
- threadgroup_lock(task);
- /* raced against de_thread() from another thread? */
- if (!thread_group_leader(task)) {
- threadgroup_unlock(task);
- put_task_struct(task);
- continue;
- }
-
- ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
-
- threadgroup_unlock(task);
- put_task_struct(task);
-
- if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
- goto out_finish;
- }
+ /* all tasks in src_csets need to be migrated */
+ list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+ cgroup_taskset_add(task, &tset);
}
+ spin_unlock_bh(&css_set_lock);
+ ret = cgroup_taskset_migrate(&tset, cgrp);
out_finish:
cgroup_migrate_finish(&preloaded_csets);
+ percpu_up_write(&cgroup_threadgroup_rwsem);
return ret;
}
@@ -2797,7 +2928,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
if (tok[0] == '\0')
continue;
for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
- if (ss->disabled || strcmp(tok + 1, ss->name))
+ if (!cgroup_ssid_enabled(ssid) ||
+ strcmp(tok + 1, ss->name))
continue;
if (*tok == '+') {
@@ -2921,7 +3053,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
ret = create_css(child, ss,
cgrp->subtree_control & (1 << ssid));
else
- ret = cgroup_populate_dir(child, 1 << ssid);
+ ret = css_populate_dir(cgroup_css(child, ss),
+ NULL);
if (ret)
goto err_undo_css;
}
@@ -2954,7 +3087,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
if (css_disable & (1 << ssid)) {
kill_css(css);
} else {
- cgroup_clear_dir(child, 1 << ssid);
+ css_clear_dir(css, NULL);
if (ss->css_reset)
ss->css_reset(css);
}
@@ -3002,15 +3135,16 @@ err_undo_css:
if (css_enable & (1 << ssid))
kill_css(css);
else
- cgroup_clear_dir(child, 1 << ssid);
+ css_clear_dir(css, NULL);
}
}
goto out_unlock;
}
-static int cgroup_populated_show(struct seq_file *seq, void *v)
+static int cgroup_events_show(struct seq_file *seq, void *v)
{
- seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+ seq_printf(seq, "populated %d\n",
+ cgroup_is_populated(seq_css(seq)->cgroup));
return 0;
}
@@ -3153,7 +3287,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
return kernfs_setattr(kn, &iattr);
}
-static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+ struct cftype *cft)
{
char name[CGROUP_FILE_NAME_MAX];
struct kernfs_node *kn;
@@ -3175,33 +3310,38 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
return ret;
}
- if (cft->write == cgroup_procs_write)
- cgrp->procs_kn = kn;
- else if (cft->seq_show == cgroup_populated_show)
- cgrp->populated_kn = kn;
+ if (cft->file_offset) {
+ struct cgroup_file *cfile = (void *)css + cft->file_offset;
+
+ kernfs_get(kn);
+ cfile->kn = kn;
+ list_add(&cfile->node, &css->files);
+ }
+
return 0;
}
/**
* cgroup_addrm_files - add or remove files to a cgroup directory
- * @cgrp: the target cgroup
+ * @css: the target css
+ * @cgrp: the target cgroup (usually css->cgroup)
* @cfts: array of cftypes to be added
* @is_add: whether to add or remove
*
* Depending on @is_add, add or remove files defined by @cfts on @cgrp.
- * For removals, this function never fails. If addition fails, this
- * function doesn't remove files already added. The caller is responsible
- * for cleaning up.
+ * For removals, this function never fails.
*/
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+ struct cgroup *cgrp, struct cftype cfts[],
bool is_add)
{
- struct cftype *cft;
+ struct cftype *cft, *cft_end = NULL;
int ret;
lockdep_assert_held(&cgroup_mutex);
- for (cft = cfts; cft->name[0] != '\0'; cft++) {
+restart:
+ for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
@@ -3213,11 +3353,13 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
continue;
if (is_add) {
- ret = cgroup_add_file(cgrp, cft);
+ ret = cgroup_add_file(css, cgrp, cft);
if (ret) {
pr_warn("%s: failed to add %s, err=%d\n",
__func__, cft->name, ret);
- return ret;
+ cft_end = cft;
+ is_add = false;
+ goto restart;
}
} else {
cgroup_rm_file(cgrp, cft);
@@ -3243,7 +3385,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
if (cgroup_is_dead(cgrp))
continue;
- ret = cgroup_addrm_files(cgrp, cfts, is_add);
+ ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
if (ret)
break;
}
@@ -3355,7 +3497,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
- if (ss->disabled)
+ if (!cgroup_ssid_enabled(ss->id))
return 0;
if (!cfts || cfts[0].name[0] == '\0')
@@ -3405,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
- /*
- * If legacy_flies_on_dfl, we want to show the legacy files on the
- * dfl hierarchy but iff the target subsystem hasn't been updated
- * for the dfl hierarchy yet.
- */
- if (!cgroup_legacy_files_on_dfl ||
- ss->dfl_cftypes != ss->legacy_cftypes) {
- for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
- cft->flags |= __CFTYPE_NOT_ON_DFL;
- }
-
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
return cgroup_add_cftypes(ss, cfts);
}
@@ -3430,10 +3563,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
int count = 0;
struct cgrp_cset_link *link;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
return count;
}
@@ -3665,22 +3798,25 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
}
/**
- * css_advance_task_iter - advance a task itererator to the next css_set
+ * css_task_iter_advance_css_set - advance a task itererator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
*/
-static void css_advance_task_iter(struct css_task_iter *it)
+static void css_task_iter_advance_css_set(struct css_task_iter *it)
{
struct list_head *l = it->cset_pos;
struct cgrp_cset_link *link;
struct css_set *cset;
+ lockdep_assert_held(&css_set_lock);
+
/* Advance to the next non-empty css_set */
do {
l = l->next;
if (l == it->cset_head) {
it->cset_pos = NULL;
+ it->task_pos = NULL;
return;
}
@@ -3691,7 +3827,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
link = list_entry(l, struct cgrp_cset_link, cset_link);
cset = link->cset;
}
- } while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+ } while (!css_set_populated(cset));
it->cset_pos = l;
@@ -3702,6 +3838,52 @@ static void css_advance_task_iter(struct css_task_iter *it)
it->tasks_head = &cset->tasks;
it->mg_tasks_head = &cset->mg_tasks;
+
+ /*
+ * We don't keep css_sets locked across iteration steps and thus
+ * need to take steps to ensure that iteration can be resumed after
+ * the lock is re-acquired. Iteration is performed at two levels -
+ * css_sets and tasks in them.
+ *
+ * Once created, a css_set never leaves its cgroup lists, so a
+ * pinned css_set is guaranteed to stay put and we can resume
+ * iteration afterwards.
+ *
+ * Tasks may leave @cset across iteration steps. This is resolved
+ * by registering each iterator with the css_set currently being
+ * walked and making css_set_move_task() advance iterators whose
+ * next task is leaving.
+ */
+ if (it->cur_cset) {
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ }
+ get_css_set(cset);
+ it->cur_cset = cset;
+ list_add(&it->iters_node, &cset->task_iters);
+}
+
+static void css_task_iter_advance(struct css_task_iter *it)
+{
+ struct list_head *l = it->task_pos;
+
+ lockdep_assert_held(&css_set_lock);
+ WARN_ON_ONCE(!l);
+
+ /*
+ * Advance iterator to find next entry. cset->tasks is consumed
+ * first and then ->mg_tasks. After ->mg_tasks, we move onto the
+ * next cset.
+ */
+ l = l->next;
+
+ if (l == it->tasks_head)
+ l = it->mg_tasks_head->next;
+
+ if (l == it->mg_tasks_head)
+ css_task_iter_advance_css_set(it);
+ else
+ it->task_pos = l;
}
/**
@@ -3713,19 +3895,16 @@ static void css_advance_task_iter(struct css_task_iter *it)
* css_task_iter_next() to walk through the tasks until the function
* returns NULL. On completion of iteration, css_task_iter_end() must be
* called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes. The caller can't sleep while iteration is in
- * progress.
*/
void css_task_iter_start(struct cgroup_subsys_state *css,
struct css_task_iter *it)
- __acquires(css_set_rwsem)
{
/* no one should try to iterate before mounting cgroups */
WARN_ON_ONCE(!use_task_css_set_links);
- down_read(&css_set_rwsem);
+ memset(it, 0, sizeof(*it));
+
+ spin_lock_bh(&css_set_lock);
it->ss = css->ss;
@@ -3736,7 +3915,9 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
it->cset_head = it->cset_pos;
- css_advance_task_iter(it);
+ css_task_iter_advance_css_set(it);
+
+ spin_unlock_bh(&css_set_lock);
}
/**
@@ -3749,30 +3930,23 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
*/
struct task_struct *css_task_iter_next(struct css_task_iter *it)
{
- struct task_struct *res;
- struct list_head *l = it->task_pos;
+ if (it->cur_task) {
+ put_task_struct(it->cur_task);
+ it->cur_task = NULL;
+ }
- /* If the iterator cg is NULL, we have no tasks */
- if (!it->cset_pos)
- return NULL;
- res = list_entry(l, struct task_struct, cg_list);
+ spin_lock_bh(&css_set_lock);
- /*
- * Advance iterator to find next entry. cset->tasks is consumed
- * first and then ->mg_tasks. After ->mg_tasks, we move onto the
- * next cset.
- */
- l = l->next;
+ if (it->task_pos) {
+ it->cur_task = list_entry(it->task_pos, struct task_struct,
+ cg_list);
+ get_task_struct(it->cur_task);
+ css_task_iter_advance(it);
+ }
- if (l == it->tasks_head)
- l = it->mg_tasks_head->next;
+ spin_unlock_bh(&css_set_lock);
- if (l == it->mg_tasks_head)
- css_advance_task_iter(it);
- else
- it->task_pos = l;
-
- return res;
+ return it->cur_task;
}
/**
@@ -3782,9 +3956,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
* Finish task iteration started by css_task_iter_start().
*/
void css_task_iter_end(struct css_task_iter *it)
- __releases(css_set_rwsem)
{
- up_read(&css_set_rwsem);
+ if (it->cur_cset) {
+ spin_lock_bh(&css_set_lock);
+ list_del(&it->iters_node);
+ put_css_set_locked(it->cur_cset);
+ spin_unlock_bh(&css_set_lock);
+ }
+
+ if (it->cur_task)
+ put_task_struct(it->cur_task);
}
/**
@@ -3809,10 +3990,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
if (ret)
@@ -3830,7 +4011,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
css_task_iter_end(&it);
if (task) {
- ret = cgroup_migrate(to, task, false);
+ ret = cgroup_migrate(task, false, to);
put_task_struct(task);
}
} while (task && !ret);
@@ -4327,13 +4508,13 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
static struct cftype cgroup_dfl_base_files[] = {
{
.name = "cgroup.procs",
+ .file_offset = offsetof(struct cgroup, procs_file),
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
.write = cgroup_procs_write,
- .mode = S_IRUGO | S_IWUSR,
},
{
.name = "cgroup.controllers",
@@ -4351,9 +4532,10 @@ static struct cftype cgroup_dfl_base_files[] = {
.write = cgroup_subtree_control_write,
},
{
- .name = "cgroup.populated",
+ .name = "cgroup.events",
.flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = cgroup_populated_show,
+ .file_offset = offsetof(struct cgroup, events_file),
+ .seq_show = cgroup_events_show,
},
{ } /* terminate */
};
@@ -4368,7 +4550,6 @@ static struct cftype cgroup_legacy_base_files[] = {
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_PROCS,
.write = cgroup_procs_write,
- .mode = S_IRUGO | S_IWUSR,
},
{
.name = "cgroup.clone_children",
@@ -4388,7 +4569,6 @@ static struct cftype cgroup_legacy_base_files[] = {
.seq_show = cgroup_pidlist_show,
.private = CGROUP_FILE_TASKS,
.write = cgroup_tasks_write,
- .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
@@ -4405,37 +4585,6 @@ static struct cftype cgroup_legacy_base_files[] = {
{ } /* terminate */
};
-/**
- * cgroup_populate_dir - create subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be added
- *
- * On failure, no file is added.
- */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
-{
- struct cgroup_subsys *ss;
- int i, ret = 0;
-
- /* process cftsets of each subsystem */
- for_each_subsys(ss, i) {
- struct cftype *cfts;
-
- if (!(subsys_mask & (1 << i)))
- continue;
-
- list_for_each_entry(cfts, &ss->cfts, node) {
- ret = cgroup_addrm_files(cgrp, cfts, true);
- if (ret < 0)
- goto err;
- }
- }
- return 0;
-err:
- cgroup_clear_dir(cgrp, subsys_mask);
- return ret;
-}
-
/*
* css destruction is four-stage process.
*
@@ -4464,9 +4613,13 @@ static void css_free_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup_subsys *ss = css->ss;
struct cgroup *cgrp = css->cgroup;
+ struct cgroup_file *cfile;
percpu_ref_exit(&css->refcnt);
+ list_for_each_entry(cfile, &css->files, node)
+ kernfs_put(cfile->kn);
+
if (ss) {
/* css free path */
int id = css->id;
@@ -4571,6 +4724,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css->ss = ss;
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
+ INIT_LIST_HEAD(&css->files);
css->serial_nr = css_serial_nr_next++;
if (cgroup_parent(cgrp)) {
@@ -4653,7 +4807,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
css->id = err;
if (visible) {
- err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ err = css_populate_dir(css, NULL);
if (err)
goto err_free_id;
}
@@ -4679,7 +4833,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
err_list_del:
list_del_rcu(&css->sibling);
- cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+ css_clear_dir(css, NULL);
err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
@@ -4696,7 +4850,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
- struct cftype *base_files;
int ssid, ret;
/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4772,12 +4925,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
- if (cgroup_on_dfl(cgrp))
- base_files = cgroup_dfl_base_files;
- else
- base_files = cgroup_legacy_base_files;
-
- ret = cgroup_addrm_files(cgrp, base_files, true);
+ ret = css_populate_dir(&cgrp->self, NULL);
if (ret)
goto out_destroy;
@@ -4864,7 +5012,7 @@ static void kill_css(struct cgroup_subsys_state *css)
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
*/
- cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+ css_clear_dir(css, NULL);
/*
* Killing would put the base ref, but we need to keep it alive
@@ -4913,19 +5061,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
{
struct cgroup_subsys_state *css;
- bool empty;
int ssid;
lockdep_assert_held(&cgroup_mutex);
/*
- * css_set_rwsem synchronizes access to ->cset_links and prevents
- * @cgrp from being removed while put_css_set() is in progress.
+ * Only migration can raise populated from zero and we're already
+ * holding cgroup_mutex.
*/
- down_read(&css_set_rwsem);
- empty = list_empty(&cgrp->cset_links);
- up_read(&css_set_rwsem);
- if (!empty)
+ if (cgroup_is_populated(cgrp))
return -EBUSY;
/*
@@ -5023,6 +5167,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
+ have_free_callback |= (bool)ss->free << ss->id;
have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
@@ -5071,6 +5216,8 @@ int __init cgroup_init_early(void)
return 0;
}
+static unsigned long cgroup_disable_mask __initdata;
+
/**
* cgroup_init - cgroup initialization
*
@@ -5081,8 +5228,9 @@ int __init cgroup_init(void)
{
struct cgroup_subsys *ss;
unsigned long key;
- int ssid, err;
+ int ssid;
+ BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
@@ -5116,14 +5264,15 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (ss->disabled)
+ if (cgroup_disable_mask & (1 << ssid)) {
+ static_branch_disable(cgroup_subsys_enabled_key[ssid]);
+ printk(KERN_INFO "Disabling %s control group subsystem\n",
+ ss->name);
continue;
+ }
cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
- ss->dfl_cftypes = ss->legacy_cftypes;
-
if (!ss->dfl_cftypes)
cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
@@ -5138,17 +5287,10 @@ int __init cgroup_init(void)
ss->bind(init_css_set.subsys[ssid]);
}
- err = sysfs_create_mount_point(fs_kobj, "cgroup");
- if (err)
- return err;
+ WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
+ WARN_ON(register_filesystem(&cgroup_fs_type));
+ WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
- err = register_filesystem(&cgroup_fs_type);
- if (err < 0) {
- sysfs_remove_mount_point(fs_kobj, "cgroup");
- return err;
- }
-
- proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
return 0;
}
@@ -5195,7 +5337,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
goto out;
mutex_lock(&cgroup_mutex);
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
for_each_root(root) {
struct cgroup_subsys *ss;
@@ -5215,19 +5357,39 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
seq_printf(m, "%sname=%s", count ? "," : "",
root->name);
seq_putc(m, ':');
+
cgrp = task_cgroup_from_root(tsk, root);
- path = cgroup_path(cgrp, buf, PATH_MAX);
- if (!path) {
- retval = -ENAMETOOLONG;
- goto out_unlock;
+
+ /*
+ * On traditional hierarchies, all zombie tasks show up as
+ * belonging to the root cgroup. On the default hierarchy,
+ * while a zombie doesn't show up in "cgroup.procs" and
+ * thus can't be migrated, its /proc/PID/cgroup keeps
+ * reporting the cgroup it belonged to before exiting. If
+ * the cgroup is removed before the zombie is reaped,
+ * " (deleted)" is appended to the cgroup path.
+ */
+ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+ path = cgroup_path(cgrp, buf, PATH_MAX);
+ if (!path) {
+ retval = -ENAMETOOLONG;
+ goto out_unlock;
+ }
+ } else {
+ path = "/";
}
+
seq_puts(m, path);
- seq_putc(m, '\n');
+
+ if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+ seq_puts(m, " (deleted)\n");
+ else
+ seq_putc(m, '\n');
}
retval = 0;
out_unlock:
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
mutex_unlock(&cgroup_mutex);
kfree(buf);
out:
@@ -5251,7 +5413,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
ss->legacy_name, ss->root->hierarchy_id,
- atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+ atomic_read(&ss->root->nr_cgrps),
+ cgroup_ssid_enabled(i));
mutex_unlock(&cgroup_mutex);
return 0;
@@ -5372,7 +5535,7 @@ void cgroup_post_fork(struct task_struct *child,
* @child during its iteration.
*
* If we won the race, @child is associated with %current's
- * css_set. Grabbing css_set_rwsem guarantees both that the
+ * css_set. Grabbing css_set_lock guarantees both that the
* association is stable, and, on completion of the parent's
* migration, @child is visible in the source of migration or
* already in the destination cgroup. This guarantee is necessary
@@ -5387,14 +5550,13 @@ void cgroup_post_fork(struct task_struct *child,
if (use_task_css_set_links) {
struct css_set *cset;
- down_write(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
- rcu_assign_pointer(child->cgroups, cset);
- list_add(&child->cg_list, &cset->tasks);
get_css_set(cset);
+ css_set_move_task(child, NULL, cset, false);
}
- up_write(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
}
/*
@@ -5429,39 +5591,42 @@ void cgroup_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
struct css_set *cset;
- bool put_cset = false;
int i;
/*
* Unlink from @tsk from its css_set. As migration path can't race
- * with us, we can check cg_list without grabbing css_set_rwsem.
+ * with us, we can check css_set and cg_list without synchronization.
*/
+ cset = task_css_set(tsk);
+
if (!list_empty(&tsk->cg_list)) {
- down_write(&css_set_rwsem);
- list_del_init(&tsk->cg_list);
- up_write(&css_set_rwsem);
- put_cset = true;
+ spin_lock_bh(&css_set_lock);
+ css_set_move_task(tsk, cset, NULL, false);
+ spin_unlock_bh(&css_set_lock);
+ } else {
+ get_css_set(cset);
}
- /* Reassign the task to the init_css_set. */
- cset = task_css_set(tsk);
- RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
/* see cgroup_post_fork() for details */
- for_each_subsys_which(ss, i, &have_exit_callback) {
- struct cgroup_subsys_state *old_css = cset->subsys[i];
- struct cgroup_subsys_state *css = task_css(tsk, i);
+ for_each_subsys_which(ss, i, &have_exit_callback)
+ ss->exit(tsk);
+}
- ss->exit(css, old_css, tsk);
- }
+void cgroup_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
+ struct cgroup_subsys *ss;
+ int ssid;
- if (put_cset)
- put_css_set(cset);
+ for_each_subsys_which(ss, ssid, &have_free_callback)
+ ss->free(task);
+
+ put_css_set(cset);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
!css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
schedule_work(&cgrp->release_agent_work);
}
@@ -5540,25 +5705,13 @@ static int __init cgroup_disable(char *str)
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
-
- ss->disabled = 1;
- printk(KERN_INFO "Disabling %s control group subsystem\n",
- ss->name);
- break;
+ cgroup_disable_mask |= 1 << i;
}
}
return 1;
}
__setup("cgroup_disable=", cgroup_disable);
-static int __init cgroup_set_legacy_files_on_dfl(char *str)
-{
- printk("cgroup: using legacy files on the default hierarchy\n");
- cgroup_legacy_files_on_dfl = true;
- return 0;
-}
-__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
-
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5662,7 +5815,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
if (!name_buf)
return -ENOMEM;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5673,7 +5826,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
kfree(name_buf);
return 0;
}
@@ -5684,7 +5837,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- down_read(&css_set_rwsem);
+ spin_lock_bh(&css_set_lock);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
@@ -5707,13 +5860,13 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
overflow:
seq_puts(seq, " ...\n");
}
- up_read(&css_set_rwsem);
+ spin_unlock_bh(&css_set_lock);
return 0;
}
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return (!cgroup_has_tasks(css->cgroup) &&
+ return (!cgroup_is_populated(css->cgroup) &&
!css_has_online_children(&css->cgroup->self));
}
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd7693ac8..cdd8df4e991c 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
css_put(old_css);
}
-static void pids_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
+static void pids_free(struct task_struct *task)
{
- struct pids_cgroup *pids = css_pids(old_css);
+ struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
pids_uncharge(pids, 1);
}
@@ -349,7 +347,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
.fork = pids_fork,
- .exit = pids_exit,
+ .free = pids_free,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
};
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 0a495ab35bc7..d8560ee3bab7 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -58,36 +58,13 @@ static void context_tracking_recursion_exit(void)
* instructions to execute won't use any RCU read side critical section
* because this function sets RCU in extended quiescent state.
*/
-void context_tracking_enter(enum ctx_state state)
+void __context_tracking_enter(enum ctx_state state)
{
- unsigned long flags;
-
- /*
- * Repeat the user_enter() check here because some archs may be calling
- * this from asm and if no CPU needs context tracking, they shouldn't
- * go further. Repeat the check here until they support the inline static
- * key check.
- */
- if (!context_tracking_is_enabled())
- return;
-
- /*
- * Some contexts may involve an exception occuring in an irq,
- * leading to that nesting:
- * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
- * This would mess up the dyntick_nesting count though. And rcu_irq_*()
- * helpers are enough to protect RCU uses inside the exception. So
- * just return immediately if we detect we are in an IRQ.
- */
- if (in_interrupt())
- return;
-
/* Kernel threads aren't supposed to go to userspace */
WARN_ON_ONCE(!current->mm);
- local_irq_save(flags);
if (!context_tracking_recursion_enter())
- goto out_irq_restore;
+ return;
if ( __this_cpu_read(context_tracking.state) != state) {
if (__this_cpu_read(context_tracking.active)) {
@@ -120,7 +97,27 @@ void context_tracking_enter(enum ctx_state state)
__this_cpu_write(context_tracking.state, state);
}
context_tracking_recursion_exit();
-out_irq_restore:
+}
+NOKPROBE_SYMBOL(__context_tracking_enter);
+EXPORT_SYMBOL_GPL(__context_tracking_enter);
+
+void context_tracking_enter(enum ctx_state state)
+{
+ unsigned long flags;
+
+ /*
+ * Some contexts may involve an exception occuring in an irq,
+ * leading to that nesting:
+ * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit()
+ * This would mess up the dyntick_nesting count though. And rcu_irq_*()
+ * helpers are enough to protect RCU uses inside the exception. So
+ * just return immediately if we detect we are in an IRQ.
+ */
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ __context_tracking_enter(state);
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_enter);
@@ -128,7 +125,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter);
void context_tracking_user_enter(void)
{
- context_tracking_enter(CONTEXT_USER);
+ user_enter();
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
@@ -144,19 +141,10 @@ NOKPROBE_SYMBOL(context_tracking_user_enter);
* This call supports re-entrancy. This way it can be called from any exception
* handler without needing to know if we came from userspace or not.
*/
-void context_tracking_exit(enum ctx_state state)
+void __context_tracking_exit(enum ctx_state state)
{
- unsigned long flags;
-
- if (!context_tracking_is_enabled())
- return;
-
- if (in_interrupt())
- return;
-
- local_irq_save(flags);
if (!context_tracking_recursion_enter())
- goto out_irq_restore;
+ return;
if (__this_cpu_read(context_tracking.state) == state) {
if (__this_cpu_read(context_tracking.active)) {
@@ -173,7 +161,19 @@ void context_tracking_exit(enum ctx_state state)
__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
}
context_tracking_recursion_exit();
-out_irq_restore:
+}
+NOKPROBE_SYMBOL(__context_tracking_exit);
+EXPORT_SYMBOL_GPL(__context_tracking_exit);
+
+void context_tracking_exit(enum ctx_state state)
+{
+ unsigned long flags;
+
+ if (in_interrupt())
+ return;
+
+ local_irq_save(flags);
+ __context_tracking_exit(state);
local_irq_restore(flags);
}
NOKPROBE_SYMBOL(context_tracking_exit);
@@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(context_tracking_exit);
void context_tracking_user_exit(void)
{
- context_tracking_exit(CONTEXT_USER);
+ user_exit();
}
NOKPROBE_SYMBOL(context_tracking_user_exit);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f0acff0f66c9..10ae73611d80 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_cpuset_subset(trial, par))
goto out;
/*
@@ -497,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
* be changed to have empty cpus_allowed or mems_allowed.
*/
ret = -ENOSPC;
- if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+ if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
if (!cpumask_empty(cur->cpus_allowed) &&
cpumask_empty(trial->cpus_allowed))
goto out;
@@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_on_dfl(css->cgroup) &&
+ if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1484,9 +1487,8 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
{
/* static buf protected by cpuset_mutex */
static nodemask_t cpuset_attach_nodemask_to;
- struct mm_struct *mm;
struct task_struct *task;
- struct task_struct *leader = cgroup_taskset_first(tset);
+ struct task_struct *leader;
struct cpuset *cs = css_cs(css);
struct cpuset *oldcs = cpuset_attach_old_cs;
@@ -1512,26 +1514,30 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
}
/*
- * Change mm, possibly for multiple threads in a threadgroup. This is
- * expensive and may sleep.
+ * Change mm for all threadgroup leaders. This is expensive and may
+ * sleep and should be moved outside migration path proper.
*/
cpuset_attach_nodemask_to = cs->effective_mems;
- mm = get_task_mm(leader);
- if (mm) {
- mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
-
- /*
- * old_mems_allowed is the same with mems_allowed here, except
- * if this task is being moved automatically due to hotplug.
- * In that case @mems_allowed has been updated and is empty,
- * so @old_mems_allowed is the right nodesets that we migrate
- * mm from.
- */
- if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
- &cpuset_attach_nodemask_to);
+ cgroup_taskset_for_each_leader(leader, tset) {
+ struct mm_struct *mm = get_task_mm(leader);
+
+ if (mm) {
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+ /*
+ * old_mems_allowed is the same with mems_allowed
+ * here, except if this task is being moved
+ * automatically due to hotplug. In that case
+ * @mems_allowed has been updated and is empty, so
+ * @old_mems_allowed is the right nodesets that we
+ * migrate mm from.
+ */
+ if (is_memory_migrate(cs)) {
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+ &cpuset_attach_nodemask_to);
+ }
+ mmput(mm);
}
- mmput(mm);
}
cs->old_mems_allowed = cpuset_attach_nodemask_to;
@@ -1594,9 +1600,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
case FILE_MEMORY_PRESSURE_ENABLED:
cpuset_memory_pressure_enabled = !!val;
break;
- case FILE_MEMORY_PRESSURE:
- retval = -EACCES;
- break;
case FILE_SPREAD_PAGE:
retval = update_flag(CS_SPREAD_PAGE, cs, val);
break;
@@ -1863,9 +1866,6 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE,
- .mode = S_IRUGO,
},
{
@@ -1952,7 +1952,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_on_dfl(cs->css.cgroup)) {
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2029,7 +2029,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_on_dfl(root_css->cgroup)) {
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2210,7 +2210,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_on_dfl(cs->css.cgroup))
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2241,7 +2241,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+ bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
mutex_lock(&cpuset_mutex);
@@ -2598,22 +2598,22 @@ int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
}
/**
- * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed
- * @tsk: pointer to task_struct of some task.
+ * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed
*
- * Description: Prints @task's name, cpuset name, and cached copy of its
+ * Description: Prints current's name, cpuset name, and cached copy of its
* mems_allowed to the kernel log.
*/
-void cpuset_print_task_mems_allowed(struct task_struct *tsk)
+void cpuset_print_current_mems_allowed(void)
{
struct cgroup *cgrp;
rcu_read_lock();
- cgrp = task_cs(tsk)->css.cgroup;
- pr_info("%s cpuset=", tsk->comm);
+ cgrp = task_cs(current)->css.cgroup;
+ pr_info("%s cpuset=", current->comm);
pr_cont_cgroup_name(cgrp);
- pr_cont(" mems_allowed=%*pbl\n", nodemask_pr_args(&tsk->mems_allowed));
+ pr_cont(" mems_allowed=%*pbl\n",
+ nodemask_pr_args(&current->mems_allowed));
rcu_read_unlock();
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ea02109aee77..1a734e0adfa7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5375,9 +5375,15 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_RAW) {
if (data->raw) {
- perf_output_put(handle, data->raw->size);
- __output_copy(handle, data->raw->data,
- data->raw->size);
+ u32 raw_size = data->raw->size;
+ u32 real_size = round_up(raw_size + sizeof(u32),
+ sizeof(u64)) - sizeof(u32);
+ u64 zero = 0;
+
+ perf_output_put(handle, real_size);
+ __output_copy(handle, data->raw->data, raw_size);
+ if (real_size - raw_size)
+ __output_copy(handle, &zero, real_size - raw_size);
} else {
struct {
u32 size;
@@ -5509,8 +5515,7 @@ void perf_prepare_sample(struct perf_event_header *header,
else
size += sizeof(u32);
- WARN_ON_ONCE(size & (sizeof(u64)-1));
- header->size += size;
+ header->size += round_up(size, sizeof(u64));
}
if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@ -9455,17 +9460,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
task_function_call(task, __perf_cgroup_move, task);
}
-static void perf_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
-{
- task_function_call(task, __perf_cgroup_move, task);
-}
-
struct cgroup_subsys perf_event_cgrp_subsys = {
.css_alloc = perf_cgroup_css_alloc,
.css_free = perf_cgroup_css_free,
- .exit = perf_cgroup_exit,
.attach = perf_cgroup_attach,
};
#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/fork.c b/kernel/fork.c
index 6ac894244d39..f97f2c449f5c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ cgroup_free(tsk);
task_numa_free(tsk);
security_task_free(tsk);
exit_creds(tsk);
@@ -454,7 +455,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+ tmp->vm_flags &=
+ ~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
@@ -1149,10 +1151,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
-#ifdef CONFIG_CGROUPS
- init_rwsem(&sig->group_rwsem);
-#endif
-
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
diff --git a/kernel/futex.c b/kernel/futex.c
index dfc86e93c31d..684d7549825a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -276,10 +276,10 @@ static struct {
static struct {
struct fault_attr attr;
- u32 ignore_private;
+ bool ignore_private;
} fail_futex = {
.attr = FAULT_ATTR_INITIALIZER,
- .ignore_private = 0,
+ .ignore_private = false,
};
static int __init setup_fail_futex(char *str)
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index 80f4f4e56fed..011f8c4c63da 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -36,7 +36,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
c = irq_data_get_irq_chip(d);
if (!c->irq_set_affinity) {
- pr_warn_ratelimited("IRQ%u: unable to set affinity\n", d->irq);
+ pr_debug("IRQ%u: unable to set affinity\n", d->irq);
} else {
int r = irq_do_set_affinity(d, affinity, false);
if (r)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 05c2188271b8..fcab63c66905 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -199,6 +199,11 @@ static inline int irq_desc_get_node(struct irq_desc *desc)
return irq_common_data_get_node(&desc->irq_common_data);
}
+static inline int irq_desc_is_chained(struct irq_desc *desc)
+{
+ return (desc->action && desc->action == &chained_action);
+}
+
#ifdef CONFIG_PM_SLEEP
bool irq_pm_check_wakeup(struct irq_desc *desc);
void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a71175ff98d5..0eebaeef317b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1838,6 +1838,7 @@ void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
kfree(__free_percpu_irq(irq, dev_id));
chip_bus_sync_unlock(desc);
}
+EXPORT_SYMBOL_GPL(free_percpu_irq);
/**
* setup_percpu_irq - setup a per-cpu interrupt
@@ -1867,9 +1868,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
* @devname: An ascii name for the claiming device
* @dev_id: A percpu cookie passed back to the handler function
*
- * This call allocates interrupt resources, but doesn't
- * automatically enable the interrupt. It has to be done on each
- * CPU using enable_percpu_irq().
+ * This call allocates interrupt resources and enables the
+ * interrupt on the local CPU. If the interrupt is supposed to be
+ * enabled on other CPUs, it has to be done on each CPU using
+ * enable_percpu_irq().
*
* Dev_id must be globally unique. It is a per-cpu variable, and
* the handler gets called with the interrupted CPU's instance of
@@ -1908,6 +1910,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
+EXPORT_SYMBOL_GPL(request_percpu_irq);
/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 21c62617a35a..cea1de0161f1 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -21,7 +21,7 @@ bool irq_pm_check_wakeup(struct irq_desc *desc)
desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
desc->depth++;
irq_disable(desc);
- pm_system_wakeup();
+ pm_system_irq_wakeup(irq_desc_get_irq(desc));
return true;
}
return false;
@@ -70,7 +70,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
static bool suspend_device_irq(struct irq_desc *desc)
{
- if (!desc->action || desc->no_suspend_depth)
+ if (!desc->action || irq_desc_is_chained(desc) ||
+ desc->no_suspend_depth)
return false;
if (irqd_is_wakeup_set(&desc->irq_data)) {
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index a916cf144b65..a2c02fd5d6d0 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -475,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v)
for_each_online_cpu(j)
any_count |= kstat_irqs_cpu(i, j);
action = desc->action;
- if ((!action || action == &chained_action) && !any_count)
+ if ((!action || irq_desc_is_chained(desc)) && !any_count)
goto out;
seq_printf(p, "%*d: ", prec, i);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4c5edc357923..d873b64fbddc 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index bd9f8a03cefa..11b64a63c0f8 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -6,7 +6,7 @@
* Version 2. See the file COPYING for more details.
*/
-#define pr_fmt(fmt) "kexec: " fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/capability.h>
#include <linux/mm.h>
@@ -1027,7 +1027,7 @@ static int __init crash_notes_memory_init(void)
crash_notes = __alloc_percpu(size, align);
if (!crash_notes) {
- pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
+ pr_warn("Memory allocation for saving cpu register states failed\n");
return -ENOMEM;
}
return 0;
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 6a9a3f2a0e8e..b70ada0028d2 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -9,6 +9,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4e49cc4c9952..deae3907ac1e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -2738,7 +2738,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
return;
/* no reclaim without waiting on it */
- if (!(gfp_mask & __GFP_WAIT))
+ if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return;
/* this guy won't enter reclaim */
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9d6b55587eaa..7658d32c5c78 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -124,9 +124,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
{
void **ptr, *addr;
- ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
+ ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL,
+ dev_to_node(dev));
if (!ptr)
- return NULL;
+ return ERR_PTR(-ENOMEM);
addr = memremap(offset, size, flags);
if (addr) {
@@ -141,9 +142,8 @@ EXPORT_SYMBOL(devm_memremap);
void devm_memunmap(struct device *dev, void *addr)
{
- WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
- addr));
- memunmap(addr);
+ WARN_ON(devres_release(dev, devm_memremap_release,
+ devm_memremap_match, addr));
}
EXPORT_SYMBOL(devm_memunmap);
@@ -176,8 +176,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
if (is_ram == REGION_INTERSECTS)
return __va(res->start);
- page_map = devres_alloc(devm_memremap_pages_release,
- sizeof(*page_map), GFP_KERNEL);
+ page_map = devres_alloc_node(devm_memremap_pages_release,
+ sizeof(*page_map), GFP_KERNEL, dev_to_node(dev));
if (!page_map)
return ERR_PTR(-ENOMEM);
@@ -185,7 +185,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res)
nid = dev_to_node(dev);
if (nid < 0)
- nid = 0;
+ nid = numa_mem_id();
error = arch_add_memory(nid, res->start, resource_size(res), true);
if (error) {
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index bd62f5cda746..6528a79d998d 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,6 +10,7 @@
*/
#include <linux/kernel.h>
+#include <linux/errno.h>
#include <keys/system_keyring.h>
#include <crypto/public_key.h>
#include "module-internal.h"
diff --git a/kernel/panic.c b/kernel/panic.c
index 04e91ff7560b..4579dbb7ed87 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,6 +23,7 @@
#include <linux/sysrq.h>
#include <linux/init.h>
#include <linux/nmi.h>
+#include <linux/console.h>
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
@@ -147,6 +148,15 @@ void panic(const char *fmt, ...)
bust_spinlocks(0);
+ /*
+ * We may have ended up stopping the CPU holding the lock (in
+ * smp_send_stop()) while still having some valuable data in the console
+ * buffer. Try to acquire the lock then release it regardless of the
+ * result. The release will also print the buffers out.
+ */
+ console_trylock();
+ console_unlock();
+
if (!panic_blink)
panic_blink = no_blink;
diff --git a/kernel/params.c b/kernel/params.c
index b6554aa71094..a6d6149c0fe6 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -223,7 +223,7 @@ char *parse_args(const char *doing,
int (*unknown)(char *param, char *val,
const char *doing, void *arg))
{
- char *param, *val;
+ char *param, *val, *err = NULL;
/* Chew leading spaces */
args = skip_spaces(args);
@@ -238,7 +238,7 @@ char *parse_args(const char *doing,
args = next_arg(args, &param, &val);
/* Stop at -- */
if (!val && strcmp(param, "--") == 0)
- return args;
+ return err ?: args;
irq_was_disabled = irqs_disabled();
ret = parse_one(param, val, doing, params, num,
min_level, max_level, arg, unknown);
@@ -247,24 +247,25 @@ char *parse_args(const char *doing,
doing, param);
switch (ret) {
+ case 0:
+ continue;
case -ENOENT:
pr_err("%s: Unknown parameter `%s'\n", doing, param);
- return ERR_PTR(ret);
+ break;
case -ENOSPC:
pr_err("%s: `%s' too large for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
- case 0:
break;
default:
pr_err("%s: `%s' invalid for parameter `%s'\n",
doing, val ?: "", param);
- return ERR_PTR(ret);
+ break;
}
+
+ err = ERR_PTR(ret);
}
- /* All parsed OK. */
- return NULL;
+ return err;
}
/* Lazy bastard, eh? */
@@ -325,10 +326,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp)
}
EXPORT_SYMBOL(param_get_charp);
-static void param_free_charp(void *arg)
+void param_free_charp(void *arg)
{
maybe_kfree_parameter(*((char **)arg));
}
+EXPORT_SYMBOL(param_free_charp);
const struct kernel_param_ops param_ops_charp = {
.set = param_set_charp,
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 690f78f210f2..b7342a24f559 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -733,7 +733,7 @@ int hibernate(void)
* contents of memory is restored from the saved image.
*
* If this is successful, control reappears in the restored target kernel in
- * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
+ * hibernation_snapshot() which returns to hibernate(). Otherwise, the routine
* attempts to recover gracefully and make the kernel return to the normal mode
* of operation.
*/
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 63d395b5df93..b2dd4d999900 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -272,6 +272,22 @@ static inline void pm_print_times_init(void)
{
pm_print_times_enabled = !!initcall_debug;
}
+
+static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
+}
+
+static ssize_t pm_wakeup_irq_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ return -EINVAL;
+}
+power_attr(pm_wakeup_irq);
+
#else /* !CONFIG_PM_SLEEP_DEBUG */
static inline void pm_print_times_init(void) {}
#endif /* CONFIG_PM_SLEEP_DEBUG */
@@ -604,6 +620,7 @@ static struct attribute * g[] = {
#endif
#ifdef CONFIG_PM_SLEEP_DEBUG
&pm_print_times_attr.attr,
+ &pm_wakeup_irq_attr.attr,
#endif
#endif
#ifdef CONFIG_FREEZER
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5235dd4e1e2f..3a970604308f 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem)
while (to_alloc-- > 0) {
struct page *page;
- page = alloc_image_page(__GFP_HIGHMEM);
+ page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM);
memory_bm_set_bit(bm, page_to_pfn(page));
}
return nr_highmem;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 7e4cda4a8dd9..f9fe133c13e2 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -35,6 +35,9 @@
const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
const char *pm_states[PM_SUSPEND_MAX];
+unsigned int pm_suspend_global_flags;
+EXPORT_SYMBOL_GPL(pm_suspend_global_flags);
+
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
@@ -493,6 +496,7 @@ static int enter_state(suspend_state_t state)
#endif
pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
+ pm_suspend_clear_flags();
error = suspend_prepare(state);
if (error)
goto Unlock;
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b2066fb5b10f..12cd989dadf6 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr,
struct bio *bio;
int error = 0;
- bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+ bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1);
bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9);
bio->bi_bdev = hib_resume_bdev;
@@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
return -ENOSPC;
if (hb) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN |
__GFP_NORETRY);
if (src) {
copy_page(src, buf);
@@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb)
ret = hib_wait_io(hb); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_WAIT |
+ src = (void *)__get_free_page(__GFP_RECLAIM |
__GFP_NOWARN |
__GFP_NORETRY);
if (src) {
@@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
last = tmp;
tmp->map = (struct swap_map_page *)
- __get_free_page(__GFP_WAIT | __GFP_HIGH);
+ __get_free_page(__GFP_RECLAIM | __GFP_HIGH);
if (!tmp->map) {
release_swap_reader(handle);
return -ENOMEM;
@@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle,
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
- __GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT | __GFP_NOWARN |
- __GFP_NORETRY);
+ __GFP_RECLAIM | __GFP_HIGH :
+ __GFP_RECLAIM | __GFP_NOWARN |
+ __GFP_NORETRY);
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 8f0324ef72ab..2ce8826f1053 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -269,6 +269,9 @@ static u32 clear_idx;
#define PREFIX_MAX 32
#define LOG_LINE_MAX (1024 - PREFIX_MAX)
+#define LOG_LEVEL(v) ((v) & 0x07)
+#define LOG_FACILITY(v) ((v) >> 3 & 0xff)
+
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
#define LOG_ALIGN 4
@@ -517,6 +520,7 @@ int check_syslog_permissions(int type, int source)
ok:
return security_syslog(type);
}
+EXPORT_SYMBOL_GPL(check_syslog_permissions);
static void append_char(char **pp, char *e, char c)
{
@@ -611,7 +615,6 @@ struct devkmsg_user {
static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
- int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
size_t len = iov_iter_count(from);
@@ -641,12 +644,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
line = buf;
if (line[0] == '<') {
char *endp = NULL;
+ unsigned int u;
- i = simple_strtoul(line+1, &endp, 10);
+ u = simple_strtoul(line + 1, &endp, 10);
if (endp && endp[0] == '>') {
- level = i & 7;
- if (i >> 3)
- facility = i >> 3;
+ level = LOG_LEVEL(u);
+ if (LOG_FACILITY(u) != 0)
+ facility = LOG_FACILITY(u);
endp++;
len -= endp - line;
line = endp;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 787320de68e0..b760bae64cf1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -1016,6 +1016,11 @@ int ptrace_request(struct task_struct *child, long request,
break;
}
#endif
+
+ case PTRACE_SECCOMP_GET_FILTER:
+ ret = seccomp_get_filter(child, addr, datavp);
+ break;
+
default:
break;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index aa5973220ad2..4d568ac9319e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8244,13 +8244,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
sched_move_task(task);
}
-static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task)
-{
- sched_move_task(task);
-}
-
#ifdef CONFIG_FAIR_GROUP_SCHED
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
@@ -8582,7 +8575,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .exit = cpu_cgroup_exit,
.legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 8cbc3db671df..26a54461bf59 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
*ut = p->utime;
*st = p->stime;
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
@@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
task_cputime(p, &cputime.utime, &cputime.stime);
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 5bd4779282df..580ac2d4024f 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -347,6 +347,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *sfilter;
int ret;
+ const bool save_orig = config_enabled(CONFIG_CHECKPOINT_RESTORE);
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
return ERR_PTR(-EINVAL);
@@ -370,7 +371,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(-ENOMEM);
ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
- seccomp_check_filter);
+ seccomp_check_filter, save_orig);
if (ret < 0) {
kfree(sfilter);
return ERR_PTR(ret);
@@ -469,7 +470,7 @@ void get_seccomp_filter(struct task_struct *tsk)
static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
if (filter) {
- bpf_prog_free(filter->prog);
+ bpf_prog_destroy(filter->prog);
kfree(filter);
}
}
@@ -867,3 +868,76 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
/* prctl interface doesn't have flags, so they are always zero. */
return do_seccomp(op, 0, uargs);
}
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
+long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
+ void __user *data)
+{
+ struct seccomp_filter *filter;
+ struct sock_fprog_kern *fprog;
+ long ret;
+ unsigned long count = 0;
+
+ if (!capable(CAP_SYS_ADMIN) ||
+ current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+ return -EACCES;
+ }
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ filter = task->seccomp.filter;
+ while (filter) {
+ filter = filter->prev;
+ count++;
+ }
+
+ if (filter_off >= count) {
+ ret = -ENOENT;
+ goto out;
+ }
+ count -= filter_off;
+
+ filter = task->seccomp.filter;
+ while (filter && count > 1) {
+ filter = filter->prev;
+ count--;
+ }
+
+ if (WARN_ON(count != 1 || !filter)) {
+ /* The filter tree shouldn't shrink while we're using it. */
+ ret = -ENOENT;
+ goto out;
+ }
+
+ fprog = filter->prog->orig_prog;
+ if (!fprog) {
+ /* This must be a new non-cBPF filter, since we save every
+ * every cBPF filter's orig_prog above when
+ * CONFIG_CHECKPOINT_RESTORE is enabled.
+ */
+ ret = -EMEDIUMTYPE;
+ goto out;
+ }
+
+ ret = fprog->len;
+ if (!data)
+ goto out;
+
+ get_seccomp_filter(task);
+ spin_unlock_irq(&task->sighand->siglock);
+
+ if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
+ ret = -EFAULT;
+
+ put_seccomp_filter(task);
+ return ret;
+
+out:
+ spin_unlock_irq(&task->sighand->siglock);
+ return ret;
+}
+#endif
diff --git a/kernel/signal.c b/kernel/signal.c
index 0f6bbbe77b46..c0b01fe24bbd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-/*
- * Notify the system that a driver wants to block all signals for this
- * process, and wants to be notified if any signals at all were to be
- * sent/acted upon. If the notifier routine returns non-zero, then the
- * signal will be acted upon after all. If the notifier routine returns 0,
- * then then signal will be blocked. Only one block per process is
- * allowed. priv is a pointer to private data that the notifier routine
- * can use to determine if the signal should be blocked or not.
- */
-void
-block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&current->sighand->siglock, flags);
- current->notifier_mask = mask;
- current->notifier_data = priv;
- current->notifier = notifier;
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
-/* Notify the system that blocking has ended. */
-
-void
-unblock_all_signals(void)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&current->sighand->siglock, flags);
- current->notifier = NULL;
- current->notifier_data = NULL;
- recalc_sigpending();
- spin_unlock_irqrestore(&current->sighand->siglock, flags);
-}
-
static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
{
struct sigqueue *q, *first = NULL;
@@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
{
int sig = next_signal(pending, mask);
- if (sig) {
- if (current->notifier) {
- if (sigismember(current->notifier_mask, sig)) {
- if (!(current->notifier)(current->notifier_data)) {
- clear_thread_flag(TIF_SIGPENDING);
- return 0;
- }
- }
- }
-
+ if (sig)
collect_signal(sig, pending, info);
- }
-
return sig;
}
@@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
sigset_t flush;
if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) {
- if (signal->flags & SIGNAL_GROUP_COREDUMP)
+ if (!(signal->flags & SIGNAL_GROUP_EXIT))
return sig == SIGKILL;
/*
* The process is in the middle of dying, nothing to do.
@@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig);
EXPORT_SYMBOL(send_sig);
EXPORT_SYMBOL(send_sig_info);
EXPORT_SYMBOL(sigprocmask);
-EXPORT_SYMBOL(block_all_signals);
-EXPORT_SYMBOL(unblock_all_signals);
-
/*
* System call entry points.
diff --git a/kernel/smp.c b/kernel/smp.c
index 07854477c164..d903c02223af 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
cpumask_var_t cpus;
int cpu, ret;
- might_sleep_if(gfp_flags & __GFP_WAIT);
+ might_sleep_if(gfpflags_allow_blocking(gfp_flags));
if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
preempt_disable();
diff --git a/kernel/sys.c b/kernel/sys.c
index fa2f2f671a5c..6af9212ab5aa 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
goto out_unlock; /* No processes for this user */
}
do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
+ if (uid_eq(task_uid(p), uid) && task_pid_vnr(p))
error = set_one_prio(p, niceval, error);
} while_each_thread(g, p);
if (!uid_eq(uid, cred->uid))
@@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
goto out_unlock; /* No processes for this user */
}
do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
+ if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a02decf15583..0623787ec67a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -194,6 +194,7 @@ cond_syscall(sys_mlock);
cond_syscall(sys_munlock);
cond_syscall(sys_mlockall);
cond_syscall(sys_munlockall);
+cond_syscall(sys_mlock2);
cond_syscall(sys_mincore);
cond_syscall(sys_madvise);
cond_syscall(sys_mremap);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e69201d8094e..dc6858d6639e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -64,6 +64,7 @@
#include <linux/binfmts.h>
#include <linux/sched/sysctl.h>
#include <linux/kexec.h>
+#include <linux/bpf.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -887,6 +888,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+ {
+ .procname = "hardlockup_panic",
+ .data = &hardlockup_panic,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
#ifdef CONFIG_SMP
{
.procname = "softlockup_all_cpu_backtrace",
@@ -897,6 +909,15 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+ {
+ .procname = "hardlockup_all_cpu_backtrace",
+ .data = &sysctl_hardlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
#endif /* CONFIG_SMP */
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -1139,6 +1160,18 @@ static struct ctl_table kern_table[] = {
.proc_handler = timer_migration_handler,
},
#endif
+#ifdef CONFIG_BPF_SYSCALL
+ {
+ .procname = "unprivileged_bpf_disabled",
+ .data = &sysctl_unprivileged_bpf_disabled,
+ .maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
+ .mode = 0644,
+ /* only handle a transition from default "0" to "1" */
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &one,
+ },
+#endif
{ }
};
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 0d8fe8b8f727..1347882d131e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -217,7 +217,7 @@ static void clocksource_watchdog(unsigned long data)
continue;
/* Check the deviation from the watchdog clocksource. */
- if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+ if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
cs->name);
pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index b1356b7ae570..d563c1960302 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
negative = (tick_error < 0);
/* Sort out the magnitude of the correction */
- tick_error = abs64(tick_error);
+ tick_error = abs(tick_error);
for (adj = 0; tick_error > interval; adj++)
tick_error >>= 1;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 74591ba9474f..bbc5d1114583 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -977,13 +977,29 @@ EXPORT_SYMBOL(add_timer);
*/
void add_timer_on(struct timer_list *timer, int cpu)
{
- struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
+ struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu);
+ struct tvec_base *base;
unsigned long flags;
timer_stats_timer_set_start_info(timer);
BUG_ON(timer_pending(timer) || !timer->function);
- spin_lock_irqsave(&base->lock, flags);
- timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
+
+ /*
+ * If @timer was on a different CPU, it should be migrated with the
+ * old base locked to prevent other operations proceeding with the
+ * wrong base locked. See lock_timer_base().
+ */
+ base = lock_timer_base(timer, &flags);
+ if (base != new_base) {
+ timer->flags |= TIMER_MIGRATING;
+
+ spin_unlock(&base->lock);
+ base = new_base;
+ spin_lock(&base->lock);
+ WRITE_ONCE(timer->flags,
+ (timer->flags & ~TIMER_BASEMASK) | cpu);
+ }
+
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
spin_unlock_irqrestore(&base->lock, flags);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1153c43428f3..e45db6b0d878 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
config BPF_EVENTS
depends on BPF_SYSCALL
- depends on KPROBE_EVENT || UPROBE_EVENT
+ depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS
bool
default y
help
@@ -635,6 +635,13 @@ config TRACE_ENUM_MAP_FILE
If unsure, say N
+config TRACING_EVENTS_GPIO
+ bool "Trace gpio events"
+ depends on GPIOLIB
+ default y
+ help
+ Enable tracing events for gpio subsystem
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 90e72a0c3047..a990824c8604 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -103,7 +103,7 @@ record_it:
memcpy((void *) t + sizeof(*t), data, len);
if (blk_tracer)
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
}
}
@@ -278,7 +278,7 @@ record_it:
memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
if (blk_tracer) {
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(blk_tr, buffer, event, 0, pc);
return;
}
}
@@ -437,7 +437,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
struct block_device *bdev,
struct blk_user_trace_setup *buts)
{
- struct blk_trace *old_bt, *bt = NULL;
+ struct blk_trace *bt = NULL;
struct dentry *dir = NULL;
int ret;
@@ -519,11 +519,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
bt->trace_state = Blktrace_setup;
ret = -EBUSY;
- old_bt = xchg(&q->blk_trace, bt);
- if (old_bt) {
- (void) xchg(&q->blk_trace, old_bt);
+ if (cmpxchg(&q->blk_trace, NULL, bt))
goto err;
- }
if (atomic_inc_return(&blk_probes_ref) == 1)
blk_register_tracepoints();
@@ -1343,6 +1340,7 @@ static const struct {
static enum print_line_t print_one_line(struct trace_iterator *iter,
bool classic)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
const struct blk_io_trace *t;
u16 what;
@@ -1351,7 +1349,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
t = te_blk_io_trace(iter->ent);
what = t->action & ((1 << BLK_TC_SHIFT) - 1);
- long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+ long_act = !!(tr->trace_flags & TRACE_ITER_VERBOSE);
log_action = classic ? &blk_log_action_classic : &blk_log_action;
if (t->action == BLK_TN_MESSAGE) {
@@ -1413,9 +1411,9 @@ blk_tracer_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
/* don't output context-info for blk_classic output */
if (bit == TRACE_BLK_OPT_CLASSIC) {
if (set)
- trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
else
- trace_flags |= TRACE_ITER_CONTEXT_INFO;
+ tr->trace_flags |= TRACE_ITER_CONTEXT_INFO;
}
return 0;
}
@@ -1481,7 +1479,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
static int blk_trace_setup_queue(struct request_queue *q,
struct block_device *bdev)
{
- struct blk_trace *old_bt, *bt = NULL;
+ struct blk_trace *bt = NULL;
int ret = -ENOMEM;
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
@@ -1497,12 +1495,9 @@ static int blk_trace_setup_queue(struct request_queue *q,
blk_trace_setup_lba(bt, bdev);
- old_bt = xchg(&q->blk_trace, bt);
- if (old_bt != NULL) {
- (void)xchg(&q->blk_trace, old_bt);
- ret = -EBUSY;
+ ret = -EBUSY;
+ if (cmpxchg(&q->blk_trace, NULL, bt))
goto free_bt;
- }
if (atomic_inc_return(&blk_probes_ref) == 1)
blk_register_tracepoints();
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7c8803..4228fd3682c3 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
if (!event)
return -ENOENT;
+ /* make sure event is local and doesn't have pmu::count */
+ if (event->oncpu != smp_processor_id() ||
+ event->pmu->count)
+ return -EINVAL;
+
/*
* we don't know if the function is run successfully by the
* return value. It can be judged in other places, such as
@@ -207,14 +212,58 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
return perf_event_read_local(event);
}
-const struct bpf_func_proto bpf_perf_event_read_proto = {
+static const struct bpf_func_proto bpf_perf_event_read_proto = {
.func = bpf_perf_event_read,
- .gpl_only = false,
+ .gpl_only = true,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_ANYTHING,
};
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+ struct pt_regs *regs = (struct pt_regs *) (long) r1;
+ struct bpf_map *map = (struct bpf_map *) (long) r2;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+ void *data = (void *) (long) r4;
+ struct perf_sample_data sample_data;
+ struct perf_event *event;
+ struct perf_raw_record raw = {
+ .size = size,
+ .data = data,
+ };
+
+ if (unlikely(index >= array->map.max_entries))
+ return -E2BIG;
+
+ event = (struct perf_event *)array->ptrs[index];
+ if (unlikely(!event))
+ return -ENOENT;
+
+ if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
+ event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+ return -EINVAL;
+
+ if (unlikely(event->oncpu != smp_processor_id()))
+ return -EOPNOTSUPP;
+
+ perf_sample_data_init(&sample_data, 0, 0);
+ sample_data.raw = &raw;
+ perf_event_output(event, &sample_data, regs);
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto = {
+ .func = bpf_perf_event_output,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_CONST_MAP_PTR,
+ .arg3_type = ARG_ANYTHING,
+ .arg4_type = ARG_PTR_TO_STACK,
+ .arg5_type = ARG_CONST_STACK_SIZE,
+};
+
static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
{
switch (func_id) {
@@ -242,6 +291,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
return &bpf_get_smp_processor_id_proto;
case BPF_FUNC_perf_event_read:
return &bpf_perf_event_read_proto;
+ case BPF_FUNC_perf_event_output:
+ return &bpf_perf_event_output_proto;
default:
return NULL;
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 00611e95a8ee..3f743b147247 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -243,6 +243,11 @@ static void ftrace_sync_ipi(void *data)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static void update_function_graph_func(void);
+
+/* Both enabled by default (can be cleared by function_graph tracer flags */
+static bool fgraph_sleep_time = true;
+static bool fgraph_graph_time = true;
+
#else
static inline void update_function_graph_func(void) { }
#endif
@@ -917,7 +922,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
calltime = trace->rettime - trace->calltime;
- if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+ if (!fgraph_graph_time) {
int index;
index = trace->depth;
@@ -3420,27 +3425,35 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-static int ftrace_match(char *str, char *regex, int len, int type)
+/* Type for quick search ftrace basic regexes (globs) from filter_parse_regex */
+struct ftrace_glob {
+ char *search;
+ unsigned len;
+ int type;
+};
+
+static int ftrace_match(char *str, struct ftrace_glob *g)
{
int matched = 0;
int slen;
- switch (type) {
+ switch (g->type) {
case MATCH_FULL:
- if (strcmp(str, regex) == 0)
+ if (strcmp(str, g->search) == 0)
matched = 1;
break;
case MATCH_FRONT_ONLY:
- if (strncmp(str, regex, len) == 0)
+ if (strncmp(str, g->search, g->len) == 0)
matched = 1;
break;
case MATCH_MIDDLE_ONLY:
- if (strstr(str, regex))
+ if (strstr(str, g->search))
matched = 1;
break;
case MATCH_END_ONLY:
slen = strlen(str);
- if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
+ if (slen >= g->len &&
+ memcmp(str + slen - g->len, g->search, g->len) == 0)
matched = 1;
break;
}
@@ -3449,13 +3462,13 @@ static int ftrace_match(char *str, char *regex, int len, int type)
}
static int
-enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
+enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int clear_filter)
{
struct ftrace_func_entry *entry;
int ret = 0;
entry = ftrace_lookup_ip(hash, rec->ip);
- if (not) {
+ if (clear_filter) {
/* Do nothing if it doesn't exist */
if (!entry)
return 0;
@@ -3472,42 +3485,68 @@ enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
}
static int
-ftrace_match_record(struct dyn_ftrace *rec, char *mod,
- char *regex, int len, int type)
+ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g,
+ struct ftrace_glob *mod_g, int exclude_mod)
{
char str[KSYM_SYMBOL_LEN];
char *modname;
kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
- if (mod) {
- /* module lookup requires matching the module */
- if (!modname || strcmp(modname, mod))
+ if (mod_g) {
+ int mod_matches = (modname) ? ftrace_match(modname, mod_g) : 0;
+
+ /* blank module name to match all modules */
+ if (!mod_g->len) {
+ /* blank module globbing: modname xor exclude_mod */
+ if ((!exclude_mod) != (!modname))
+ goto func_match;
+ return 0;
+ }
+
+ /* not matching the module */
+ if (!modname || !mod_matches) {
+ if (exclude_mod)
+ goto func_match;
+ else
+ return 0;
+ }
+
+ if (mod_matches && exclude_mod)
return 0;
+func_match:
/* blank search means to match all funcs in the mod */
- if (!len)
+ if (!func_g->len)
return 1;
}
- return ftrace_match(str, regex, len, type);
+ return ftrace_match(str, func_g);
}
static int
-match_records(struct ftrace_hash *hash, char *buff,
- int len, char *mod, int not)
+match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
{
- unsigned search_len = 0;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
- int type = MATCH_FULL;
- char *search = buff;
+ struct ftrace_glob func_g = { .type = MATCH_FULL };
+ struct ftrace_glob mod_g = { .type = MATCH_FULL };
+ struct ftrace_glob *mod_match = (mod) ? &mod_g : NULL;
+ int exclude_mod = 0;
int found = 0;
int ret;
+ int clear_filter;
+
+ if (func) {
+ func_g.type = filter_parse_regex(func, len, &func_g.search,
+ &clear_filter);
+ func_g.len = strlen(func_g.search);
+ }
- if (len) {
- type = filter_parse_regex(buff, len, &search, &not);
- search_len = strlen(search);
+ if (mod) {
+ mod_g.type = filter_parse_regex(mod, strlen(mod),
+ &mod_g.search, &exclude_mod);
+ mod_g.len = strlen(mod_g.search);
}
mutex_lock(&ftrace_lock);
@@ -3516,8 +3555,8 @@ match_records(struct ftrace_hash *hash, char *buff,
goto out_unlock;
do_for_each_ftrace_rec(pg, rec) {
- if (ftrace_match_record(rec, mod, search, search_len, type)) {
- ret = enter_record(hash, rec, not);
+ if (ftrace_match_record(rec, &func_g, mod_match, exclude_mod)) {
+ ret = enter_record(hash, rec, clear_filter);
if (ret < 0) {
found = ret;
goto out_unlock;
@@ -3534,26 +3573,9 @@ match_records(struct ftrace_hash *hash, char *buff,
static int
ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
{
- return match_records(hash, buff, len, NULL, 0);
+ return match_records(hash, buff, len, NULL);
}
-static int
-ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
-{
- int not = 0;
-
- /* blank or '*' mean the same */
- if (strcmp(buff, "*") == 0)
- buff[0] = 0;
-
- /* handle the case of 'dont filter this module' */
- if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
- buff[0] = 0;
- not = 1;
- }
-
- return match_records(hash, buff, strlen(buff), mod, not);
-}
/*
* We register the module command as a template to show others how
@@ -3562,10 +3584,9 @@ ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
static int
ftrace_mod_callback(struct ftrace_hash *hash,
- char *func, char *cmd, char *param, int enable)
+ char *func, char *cmd, char *module, int enable)
{
- char *mod;
- int ret = -EINVAL;
+ int ret;
/*
* cmd == 'mod' because we only registered this func
@@ -3574,21 +3595,11 @@ ftrace_mod_callback(struct ftrace_hash *hash,
* you can tell which command was used by the cmd
* parameter.
*/
-
- /* we must have a module name */
- if (!param)
- return ret;
-
- mod = strsep(&param, ":");
- if (!strlen(mod))
- return ret;
-
- ret = ftrace_match_module_records(hash, func, mod);
+ ret = match_records(hash, func, strlen(func), module);
if (!ret)
- ret = -EINVAL;
+ return -EINVAL;
if (ret < 0)
return ret;
-
return 0;
}
@@ -3699,19 +3710,20 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
{
struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_probe *entry;
+ struct ftrace_glob func_g;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *old_hash = *orig_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
- int type, len, not;
+ int not;
unsigned long key;
int count = 0;
- char *search;
int ret;
- type = filter_parse_regex(glob, strlen(glob), &search, &not);
- len = strlen(search);
+ func_g.type = filter_parse_regex(glob, strlen(glob),
+ &func_g.search, &not);
+ func_g.len = strlen(func_g.search);
/* we do not support '!' for function probes */
if (WARN_ON(not))
@@ -3738,7 +3750,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
do_for_each_ftrace_rec(pg, rec) {
- if (!ftrace_match_record(rec, NULL, search, len, type))
+ if (!ftrace_match_record(rec, &func_g, NULL, 0))
continue;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -3811,24 +3823,24 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
+ struct ftrace_glob func_g;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
struct ftrace_hash *old_hash = *orig_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
- int type = MATCH_FULL;
- int i, len = 0;
- char *search;
- int ret;
+ int i, ret;
if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
- glob = NULL;
+ func_g.search = NULL;
else if (glob) {
int not;
- type = filter_parse_regex(glob, strlen(glob), &search, &not);
- len = strlen(search);
+ func_g.type = filter_parse_regex(glob, strlen(glob),
+ &func_g.search, &not);
+ func_g.len = strlen(func_g.search);
+ func_g.search = glob;
/* we do not support '!' for function probes */
if (WARN_ON(not))
@@ -3857,10 +3869,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
continue;
/* do this last, since it is the most expensive */
- if (glob) {
+ if (func_g.search) {
kallsyms_lookup(entry->ip, NULL, NULL,
NULL, str);
- if (!ftrace_match(str, glob, len, type))
+ if (!ftrace_match(str, &func_g))
continue;
}
@@ -3889,7 +3901,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
ftrace_free_entry(entry);
}
mutex_unlock(&ftrace_lock);
-
+
out_unlock:
mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
free_ftrace_hash(hash);
@@ -4605,21 +4617,21 @@ ftrace_graph_release(struct inode *inode, struct file *file)
static int
ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
{
+ struct ftrace_glob func_g;
struct dyn_ftrace *rec;
struct ftrace_page *pg;
- int search_len;
int fail = 1;
- int type, not;
- char *search;
+ int not;
bool exists;
int i;
/* decode regex */
- type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
+ func_g.type = filter_parse_regex(buffer, strlen(buffer),
+ &func_g.search, &not);
if (!not && *idx >= size)
return -EBUSY;
- search_len = strlen(search);
+ func_g.len = strlen(func_g.search);
mutex_lock(&ftrace_lock);
@@ -4630,7 +4642,7 @@ ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer)
do_for_each_ftrace_rec(pg, rec) {
- if (ftrace_match_record(rec, NULL, search, search_len, type)) {
+ if (ftrace_match_record(rec, &func_g, NULL, 0)) {
/* if it is in the array */
exists = false;
for (i = 0; i < *idx; i++) {
@@ -4783,17 +4795,6 @@ static int ftrace_cmp_ips(const void *a, const void *b)
return 0;
}
-static void ftrace_swap_ips(void *a, void *b, int size)
-{
- unsigned long *ipa = a;
- unsigned long *ipb = b;
- unsigned long t;
-
- t = *ipa;
- *ipa = *ipb;
- *ipb = t;
-}
-
static int ftrace_process_locs(struct module *mod,
unsigned long *start,
unsigned long *end)
@@ -4813,7 +4814,7 @@ static int ftrace_process_locs(struct module *mod,
return 0;
sort(start, count, sizeof(*start),
- ftrace_cmp_ips, ftrace_swap_ips);
+ ftrace_cmp_ips, NULL);
start_pg = ftrace_allocate_pages(count);
if (!start_pg)
@@ -5639,6 +5640,16 @@ static struct ftrace_ops graph_ops = {
ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
};
+void ftrace_graph_sleep_time_control(bool enable)
+{
+ fgraph_sleep_time = enable;
+}
+
+void ftrace_graph_graph_time_control(bool enable)
+{
+ fgraph_graph_time = enable;
+}
+
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
return 0;
@@ -5707,7 +5718,7 @@ ftrace_graph_probe_sched_switch(void *ignore, bool preempt,
* Does the user want to count the time a function was asleep.
* If so, do not update the time stamps.
*/
- if (trace_flags & TRACE_ITER_SLEEP_TIME)
+ if (fgraph_sleep_time)
return;
timestamp = trace_clock_local();
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fc347f8b1bca..75f1d05ea82d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -829,7 +829,7 @@ rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
* writer is ever on it, the previous pointer never points
* back to the reader page.
*/
-static int rb_is_reader_page(struct buffer_page *page)
+static bool rb_is_reader_page(struct buffer_page *page)
{
struct list_head *list = page->list.prev;
@@ -2270,7 +2270,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
return skip_time_extend(event);
}
-static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event);
/**
@@ -2498,7 +2498,7 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
event->time_delta = 1;
}
-static inline int
+static inline bool
rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
@@ -3039,7 +3039,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = cpu_buffer->reader_page;
struct buffer_page *head = rb_set_head_page(cpu_buffer);
@@ -3047,7 +3047,7 @@ static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
/* In case of error, head will be NULL */
if (unlikely(!head))
- return 1;
+ return true;
return reader->read == rb_page_commit(reader) &&
(commit == reader ||
@@ -4267,7 +4267,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset);
* rind_buffer_empty - is the ring buffer empty?
* @buffer: The ring buffer to test
*/
-int ring_buffer_empty(struct ring_buffer *buffer)
+bool ring_buffer_empty(struct ring_buffer *buffer)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4285,10 +4285,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
local_irq_restore(flags);
if (!ret)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -4297,7 +4297,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
* @buffer: The ring buffer
* @cpu: The CPU buffer to test
*/
-int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+bool ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long flags;
@@ -4305,7 +4305,7 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
int ret;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
- return 1;
+ return true;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a1503a027ee2..6df9a83e20d7 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -24,8 +24,8 @@ struct rb_page {
static int wakeup_interval = 100;
static int reader_finish;
-static struct completion read_start;
-static struct completion read_done;
+static DECLARE_COMPLETION(read_start);
+static DECLARE_COMPLETION(read_done);
static struct ring_buffer *buffer;
static struct task_struct *producer;
@@ -60,12 +60,12 @@ MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer");
static int read_events;
-static int kill_test;
+static int test_error;
-#define KILL_TEST() \
+#define TEST_ERROR() \
do { \
- if (!kill_test) { \
- kill_test = 1; \
+ if (!test_error) { \
+ test_error = 1; \
WARN_ON(1); \
} \
} while (0)
@@ -75,6 +75,11 @@ enum event_status {
EVENT_DROPPED,
};
+static bool break_test(void)
+{
+ return test_error || kthread_should_stop();
+}
+
static enum event_status read_event(int cpu)
{
struct ring_buffer_event *event;
@@ -87,7 +92,7 @@ static enum event_status read_event(int cpu)
entry = ring_buffer_event_data(event);
if (*entry != cpu) {
- KILL_TEST();
+ TEST_ERROR();
return EVENT_DROPPED;
}
@@ -115,10 +120,10 @@ static enum event_status read_page(int cpu)
rpage = bpage;
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
- for (i = 0; i < commit && !kill_test; i += inc) {
+ for (i = 0; i < commit && !test_error ; i += inc) {
if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
@@ -128,7 +133,7 @@ static enum event_status read_page(int cpu)
case RINGBUF_TYPE_PADDING:
/* failed writes may be discarded events */
if (!event->time_delta)
- KILL_TEST();
+ TEST_ERROR();
inc = event->array[0] + 4;
break;
case RINGBUF_TYPE_TIME_EXTEND:
@@ -137,12 +142,12 @@ static enum event_status read_page(int cpu)
case 0:
entry = ring_buffer_event_data(event);
if (*entry != cpu) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
read++;
if (!event->array[0]) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
inc = event->array[0] + 4;
@@ -150,17 +155,17 @@ static enum event_status read_page(int cpu)
default:
entry = ring_buffer_event_data(event);
if (*entry != cpu) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
read++;
inc = ((event->type_len + 1) * 4);
}
- if (kill_test)
+ if (test_error)
break;
if (inc <= 0) {
- KILL_TEST();
+ TEST_ERROR();
break;
}
}
@@ -178,10 +183,14 @@ static void ring_buffer_consumer(void)
read_events ^= 1;
read = 0;
- while (!reader_finish && !kill_test) {
- int found;
+ /*
+ * Continue running until the producer specifically asks to stop
+ * and is ready for the completion.
+ */
+ while (!READ_ONCE(reader_finish)) {
+ int found = 1;
- do {
+ while (found && !test_error) {
int cpu;
found = 0;
@@ -193,19 +202,25 @@ static void ring_buffer_consumer(void)
else
stat = read_page(cpu);
- if (kill_test)
+ if (test_error)
break;
+
if (stat == EVENT_FOUND)
found = 1;
+
}
- } while (found && !kill_test);
+ }
+ /* Wait till the producer wakes us up when there is more data
+ * available or when the producer wants us to finish reading.
+ */
set_current_state(TASK_INTERRUPTIBLE);
if (reader_finish)
break;
schedule();
}
+ __set_current_state(TASK_RUNNING);
reader_finish = 0;
complete(&read_done);
}
@@ -263,10 +278,7 @@ static void ring_buffer_producer(void)
if (cnt % wakeup_interval)
cond_resched();
#endif
- if (kthread_should_stop())
- kill_test = 1;
-
- } while (ktime_before(end_time, timeout) && !kill_test);
+ } while (ktime_before(end_time, timeout) && !break_test());
trace_printk("End ring buffer hammer\n");
if (consumer) {
@@ -276,8 +288,6 @@ static void ring_buffer_producer(void)
/* the completions must be visible before the finish var */
smp_wmb();
reader_finish = 1;
- /* finish var visible before waking up the consumer */
- smp_wmb();
wake_up_process(consumer);
wait_for_completion(&read_done);
}
@@ -287,7 +297,7 @@ static void ring_buffer_producer(void)
entries = ring_buffer_entries(buffer);
overruns = ring_buffer_overruns(buffer);
- if (kill_test && !kthread_should_stop())
+ if (test_error)
trace_printk("ERROR!\n");
if (!disable_reader) {
@@ -368,15 +378,14 @@ static void wait_to_die(void)
static int ring_buffer_consumer_thread(void *arg)
{
- while (!kthread_should_stop() && !kill_test) {
+ while (!break_test()) {
complete(&read_start);
ring_buffer_consumer();
set_current_state(TASK_INTERRUPTIBLE);
- if (kthread_should_stop() || kill_test)
+ if (break_test())
break;
-
schedule();
}
__set_current_state(TASK_RUNNING);
@@ -389,27 +398,27 @@ static int ring_buffer_consumer_thread(void *arg)
static int ring_buffer_producer_thread(void *arg)
{
- init_completion(&read_start);
-
- while (!kthread_should_stop() && !kill_test) {
+ while (!break_test()) {
ring_buffer_reset(buffer);
if (consumer) {
- smp_wmb();
wake_up_process(consumer);
wait_for_completion(&read_start);
}
ring_buffer_producer();
- if (kill_test)
+ if (break_test())
goto out_kill;
trace_printk("Sleeping for 10 secs\n");
set_current_state(TASK_INTERRUPTIBLE);
+ if (break_test())
+ goto out_kill;
schedule_timeout(HZ * SLEEP_TIME);
}
out_kill:
+ __set_current_state(TASK_RUNNING);
if (!kthread_should_stop())
wait_to_die();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6e79408674aa..87fb9801bd9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -100,8 +100,6 @@ static DEFINE_PER_CPU(bool, trace_cmdline_save);
*/
static int tracing_disabled = 1;
-DEFINE_PER_CPU(int, ftrace_cpu_disabled);
-
cpumask_var_t __read_mostly tracing_buffer_mask;
/*
@@ -214,12 +212,10 @@ __setup("alloc_snapshot", boot_alloc_snapshot);
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
-static char *trace_boot_options __initdata;
static int __init set_trace_boot_options(char *str)
{
strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
- trace_boot_options = trace_boot_options_buf;
return 0;
}
__setup("trace_options=", set_trace_boot_options);
@@ -250,6 +246,19 @@ unsigned long long ns2usecs(cycle_t nsec)
return nsec;
}
+/* trace_flags holds trace_options default values */
+#define TRACE_DEFAULT_FLAGS \
+ (FUNCTION_DEFAULT_FLAGS | \
+ TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | \
+ TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | \
+ TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | \
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS)
+
+/* trace_options that are only supported by global_trace */
+#define TOP_LEVEL_TRACE_FLAGS (TRACE_ITER_PRINTK | \
+ TRACE_ITER_PRINTK_MSGONLY | TRACE_ITER_RECORD_CMD)
+
+
/*
* The global_trace is the descriptor that holds the tracing
* buffers for the live tracing. For each CPU, it contains
@@ -262,7 +271,9 @@ unsigned long long ns2usecs(cycle_t nsec)
* pages for the buffer for that CPU. Each CPU has the same number
* of pages allocated for its buffer.
*/
-static struct trace_array global_trace;
+static struct trace_array global_trace = {
+ .trace_flags = TRACE_DEFAULT_FLAGS,
+};
LIST_HEAD(ftrace_trace_arrays);
@@ -468,11 +479,29 @@ static inline void trace_access_lock_init(void)
#endif
-/* trace_flags holds trace_options default values */
-unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
- TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
+#ifdef CONFIG_STACKTRACE
+static void __ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs);
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs);
+
+#else
+static inline void __ftrace_trace_stack(struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
+{
+}
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
+{
+}
+
+#endif
static void tracer_tracing_on(struct trace_array *tr)
{
@@ -518,7 +547,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
int alloc;
int pc;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
pc = preempt_count();
@@ -548,7 +577,7 @@ int __trace_puts(unsigned long ip, const char *str, int size)
entry->buf[size] = '\0';
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, irq_flags, 4, pc);
+ ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
return size;
}
@@ -568,7 +597,7 @@ int __trace_bputs(unsigned long ip, const char *str)
int size = sizeof(struct bputs_entry);
int pc;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
pc = preempt_count();
@@ -588,7 +617,7 @@ int __trace_bputs(unsigned long ip, const char *str)
entry->str = str;
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, irq_flags, 4, pc);
+ ftrace_trace_stack(&global_trace, buffer, irq_flags, 4, pc, NULL);
return 1;
}
@@ -834,34 +863,18 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
return nsecs / 1000;
}
+/*
+ * TRACE_FLAGS is defined as a tuple matching bit masks with strings.
+ * It uses C(a, b) where 'a' is the enum name and 'b' is the string that
+ * matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
+ * of strings in the order that the enums were defined.
+ */
+#undef C
+#define C(a, b) b
+
/* These must match the bit postions in trace_iterator_flags */
static const char *trace_options[] = {
- "print-parent",
- "sym-offset",
- "sym-addr",
- "verbose",
- "raw",
- "hex",
- "bin",
- "block",
- "stacktrace",
- "trace_printk",
- "ftrace_preempt",
- "branch",
- "annotate",
- "userstacktrace",
- "sym-userobj",
- "printk-msg-only",
- "context-info",
- "latency-format",
- "sleep-time",
- "graph-time",
- "record-cmd",
- "overwrite",
- "disable_on_free",
- "irq-info",
- "markers",
- "function-trace",
+ TRACE_FLAGS
NULL
};
@@ -1204,13 +1217,17 @@ static inline int run_tracer_selftest(struct tracer *type)
}
#endif /* CONFIG_FTRACE_STARTUP_TEST */
+static void add_tracer_options(struct trace_array *tr, struct tracer *t);
+
+static void __init apply_trace_boot_options(void);
+
/**
* register_tracer - register a tracer with the ftrace system.
* @type - the plugin for the tracer
*
* Register a new plugin tracer.
*/
-int register_tracer(struct tracer *type)
+int __init register_tracer(struct tracer *type)
{
struct tracer *t;
int ret = 0;
@@ -1253,6 +1270,7 @@ int register_tracer(struct tracer *type)
type->next = trace_types;
trace_types = type;
+ add_tracer_options(&global_trace, type);
out:
tracing_selftest_running = false;
@@ -1268,6 +1286,9 @@ int register_tracer(struct tracer *type)
/* Do we want this tracer to start on bootup? */
tracing_set_tracer(&global_trace, type->name);
default_bootup_tracer = NULL;
+
+ apply_trace_boot_options();
+
/* disable other selftests, since this will break it. */
tracing_selftest_disabled = true;
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1671,23 +1692,16 @@ __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *eve
ring_buffer_unlock_commit(buffer, event);
}
-static inline void
-__trace_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
+void trace_buffer_unlock_commit(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags, int pc)
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
ftrace_trace_userstack(buffer, flags, pc);
}
-
-void trace_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
-{
- __trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
static struct ring_buffer *temp_buffer;
@@ -1729,22 +1743,15 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
}
EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
-void trace_current_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event,
- unsigned long flags, int pc)
-{
- __trace_buffer_unlock_commit(buffer, event, flags, pc);
-}
-EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
-
-void trace_buffer_unlock_commit_regs(struct ring_buffer *buffer,
+void trace_buffer_unlock_commit_regs(struct trace_array *tr,
+ struct ring_buffer *buffer,
struct ring_buffer_event *event,
unsigned long flags, int pc,
struct pt_regs *regs)
{
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack_regs(buffer, flags, 0, pc, regs);
+ ftrace_trace_stack(tr, buffer, flags, 6, pc, regs);
ftrace_trace_userstack(buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
@@ -1766,10 +1773,6 @@ trace_function(struct trace_array *tr,
struct ring_buffer_event *event;
struct ftrace_entry *entry;
- /* If we are reading the ring buffer, don't trace */
- if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
- return;
-
event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
flags, pc);
if (!event)
@@ -1873,24 +1876,17 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
}
-void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc, struct pt_regs *regs)
+static inline void ftrace_trace_stack(struct trace_array *tr,
+ struct ring_buffer *buffer,
+ unsigned long flags,
+ int skip, int pc, struct pt_regs *regs)
{
- if (!(trace_flags & TRACE_ITER_STACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER_STACKTRACE))
return;
__ftrace_trace_stack(buffer, flags, skip, pc, regs);
}
-void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc)
-{
- if (!(trace_flags & TRACE_ITER_STACKTRACE))
- return;
-
- __ftrace_trace_stack(buffer, flags, skip, pc, NULL);
-}
-
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
@@ -1929,7 +1925,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
struct userstack_entry *entry;
struct stack_trace trace;
- if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
/*
@@ -2173,7 +2169,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
memcpy(entry->buf, tbuffer, sizeof(u32) * len);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
}
out:
@@ -2225,7 +2221,7 @@ __trace_array_vprintk(struct ring_buffer *buffer,
memcpy(&entry->buf, tbuffer, len + 1);
if (!call_filter_check_discard(call, entry, buffer, event)) {
__buffer_unlock_commit(buffer, event);
- ftrace_trace_stack(buffer, flags, 6, pc);
+ ftrace_trace_stack(&global_trace, buffer, flags, 6, pc, NULL);
}
out:
preempt_enable_notrace();
@@ -2246,7 +2242,7 @@ int trace_array_printk(struct trace_array *tr,
int ret;
va_list ap;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -2261,7 +2257,7 @@ int trace_array_printk_buf(struct ring_buffer *buffer,
int ret;
va_list ap;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
va_start(ap, fmt);
@@ -2602,7 +2598,7 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ unsigned long sym_flags = (global_trace.trace_flags & TRACE_ITER_SYM_MASK);
struct trace_buffer *buf = iter->trace_buffer;
struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
struct tracer *type = iter->trace;
@@ -2664,20 +2660,22 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
static void test_cpu_buff_start(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
+ struct trace_array *tr = iter->tr;
- if (!(trace_flags & TRACE_ITER_ANNOTATE))
+ if (!(tr->trace_flags & TRACE_ITER_ANNOTATE))
return;
if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
return;
- if (cpumask_test_cpu(iter->cpu, iter->started))
+ if (iter->started && cpumask_test_cpu(iter->cpu, iter->started))
return;
if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
return;
- cpumask_set_cpu(iter->cpu, iter->started);
+ if (iter->started)
+ cpumask_set_cpu(iter->cpu, iter->started);
/* Don't print started cpu buffer for the first entry of the trace */
if (iter->idx > 1)
@@ -2687,8 +2685,9 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK);
struct trace_entry *entry;
struct trace_event *event;
@@ -2698,7 +2697,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
event = ftrace_find_event(entry->type);
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
trace_print_lat_context(iter);
else
@@ -2718,13 +2717,14 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
struct trace_event *event;
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO)
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO)
trace_seq_printf(s, "%d %d %llu ",
entry->pid, iter->cpu, iter->ts);
@@ -2742,6 +2742,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
unsigned char newline = '\n';
struct trace_entry *entry;
@@ -2749,7 +2750,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
SEQ_PUT_HEX_FIELD(s, entry->pid);
SEQ_PUT_HEX_FIELD(s, iter->cpu);
SEQ_PUT_HEX_FIELD(s, iter->ts);
@@ -2771,13 +2772,14 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry;
struct trace_event *event;
entry = iter->ent;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
SEQ_PUT_FIELD(s, entry->pid);
SEQ_PUT_FIELD(s, iter->cpu);
SEQ_PUT_FIELD(s, iter->ts);
@@ -2826,6 +2828,8 @@ int trace_empty(struct trace_iterator *iter)
/* Called with trace_event_read_lock() held. */
enum print_line_t print_trace_line(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
+ unsigned long trace_flags = tr->trace_flags;
enum print_line_t ret;
if (iter->lost_events) {
@@ -2871,6 +2875,7 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
void trace_latency_header(struct seq_file *m)
{
struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
/* print nothing if the buffers are empty */
if (trace_empty(iter))
@@ -2879,13 +2884,15 @@ void trace_latency_header(struct seq_file *m)
if (iter->iter_flags & TRACE_FILE_LAT_FMT)
print_trace_header(m, iter);
- if (!(trace_flags & TRACE_ITER_VERBOSE))
+ if (!(tr->trace_flags & TRACE_ITER_VERBOSE))
print_lat_help_header(m);
}
void trace_default_header(struct seq_file *m)
{
struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+ unsigned long trace_flags = tr->trace_flags;
if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
@@ -3230,7 +3237,7 @@ static int tracing_open(struct inode *inode, struct file *file)
iter = __tracing_open(inode, file, false);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
- else if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ else if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
}
@@ -3477,7 +3484,7 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
trace_opts = tr->current_trace->flags->opts;
for (i = 0; trace_options[i]; i++) {
- if (trace_flags & (1 << i))
+ if (tr->trace_flags & (1 << i))
seq_printf(m, "%s\n", trace_options[i]);
else
seq_printf(m, "no%s\n", trace_options[i]);
@@ -3542,7 +3549,7 @@ int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
/* do nothing if flag is already set */
- if (!!(trace_flags & mask) == !!enabled)
+ if (!!(tr->trace_flags & mask) == !!enabled)
return 0;
/* Give the tracer a chance to approve the change */
@@ -3551,9 +3558,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
return -EINVAL;
if (enabled)
- trace_flags |= mask;
+ tr->trace_flags |= mask;
else
- trace_flags &= ~mask;
+ tr->trace_flags &= ~mask;
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
@@ -3565,8 +3572,10 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
#endif
}
- if (mask == TRACE_ITER_PRINTK)
+ if (mask == TRACE_ITER_PRINTK) {
trace_printk_start_stop_comm(enabled);
+ trace_printk_control(enabled);
+ }
return 0;
}
@@ -3577,6 +3586,7 @@ static int trace_set_options(struct trace_array *tr, char *option)
int neg = 0;
int ret = -ENODEV;
int i;
+ size_t orig_len = strlen(option);
cmp = strstrip(option);
@@ -3600,9 +3610,36 @@ static int trace_set_options(struct trace_array *tr, char *option)
mutex_unlock(&trace_types_lock);
+ /*
+ * If the first trailing whitespace is replaced with '\0' by strstrip,
+ * turn it back into a space.
+ */
+ if (orig_len > strlen(option))
+ option[strlen(option)] = ' ';
+
return ret;
}
+static void __init apply_trace_boot_options(void)
+{
+ char *buf = trace_boot_options_buf;
+ char *option;
+
+ while (true) {
+ option = strsep(&buf, ",");
+
+ if (!option)
+ break;
+
+ if (*option)
+ trace_set_options(&global_trace, option);
+
+ /* Put back the comma to allow this to be called again */
+ if (buf)
+ *(buf - 1) = ',';
+ }
+}
+
static ssize_t
tracing_trace_options_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -4297,11 +4334,8 @@ int tracing_update_buffers(void)
struct trace_option_dentry;
-static struct trace_option_dentry *
-create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
-
static void
-destroy_trace_option_files(struct trace_option_dentry *topts);
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
/*
* Used to clear out the tracer before deletion of an instance.
@@ -4320,20 +4354,13 @@ static void tracing_set_nop(struct trace_array *tr)
tr->current_trace = &nop_trace;
}
-static void update_tracer_options(struct trace_array *tr, struct tracer *t)
+static void add_tracer_options(struct trace_array *tr, struct tracer *t)
{
- static struct trace_option_dentry *topts;
-
/* Only enable if the directory has been created already. */
if (!tr->dir)
return;
- /* Currently, only the top instance has options */
- if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL))
- return;
-
- destroy_trace_option_files(topts);
- topts = create_trace_option_files(tr, t);
+ create_trace_option_files(tr, t);
}
static int tracing_set_tracer(struct trace_array *tr, const char *buf)
@@ -4402,7 +4429,6 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf)
free_snapshot(tr);
}
#endif
- update_tracer_options(tr, t);
#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
@@ -4522,6 +4548,8 @@ out:
return ret;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+
static ssize_t
tracing_max_lat_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -4536,6 +4564,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
}
+#endif
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
@@ -4569,7 +4599,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
/* trace pipe does not show start of buffer */
cpumask_setall(iter->started);
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -4626,11 +4656,13 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
static unsigned int
trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
+ struct trace_array *tr = iter->tr;
+
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return POLLIN | POLLRDNORM;
- if (trace_flags & TRACE_ITER_BLOCK)
+ if (tr->trace_flags & TRACE_ITER_BLOCK)
/*
* Always select as readable when in blocking mode
*/
@@ -5047,7 +5079,7 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
/* disable tracing ? */
- if (trace_flags & TRACE_ITER_STOP_ON_FREE)
+ if (tr->trace_flags & TRACE_ITER_STOP_ON_FREE)
tracer_tracing_off(tr);
/* resize the ring buffer to 0 */
tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
@@ -5080,7 +5112,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (tracing_disabled)
return -EINVAL;
- if (!(trace_flags & TRACE_ITER_MARKERS))
+ if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
if (cnt > TRACE_BUF_SIZE)
@@ -5435,12 +5467,14 @@ static const struct file_operations tracing_thresh_fops = {
.llseek = generic_file_llseek,
};
+#ifdef CONFIG_TRACER_MAX_TRACE
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
.write = tracing_max_lat_write,
.llseek = generic_file_llseek,
};
+#endif
static const struct file_operations set_tracer_fops = {
.open = tracing_open_generic,
@@ -6132,13 +6166,6 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu)
#include "trace_selftest.c"
#endif
-struct trace_option_dentry {
- struct tracer_opt *opt;
- struct tracer_flags *flags;
- struct trace_array *tr;
- struct dentry *entry;
-};
-
static ssize_t
trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -6191,14 +6218,51 @@ static const struct file_operations trace_options_fops = {
.llseek = generic_file_llseek,
};
+/*
+ * In order to pass in both the trace_array descriptor as well as the index
+ * to the flag that the trace option file represents, the trace_array
+ * has a character array of trace_flags_index[], which holds the index
+ * of the bit for the flag it represents. index[0] == 0, index[1] == 1, etc.
+ * The address of this character array is passed to the flag option file
+ * read/write callbacks.
+ *
+ * In order to extract both the index and the trace_array descriptor,
+ * get_tr_index() uses the following algorithm.
+ *
+ * idx = *ptr;
+ *
+ * As the pointer itself contains the address of the index (remember
+ * index[1] == 1).
+ *
+ * Then to get the trace_array descriptor, by subtracting that index
+ * from the ptr, we get to the start of the index itself.
+ *
+ * ptr - idx == &index[0]
+ *
+ * Then a simple container_of() from that pointer gets us to the
+ * trace_array descriptor.
+ */
+static void get_tr_index(void *data, struct trace_array **ptr,
+ unsigned int *pindex)
+{
+ *pindex = *(unsigned char *)data;
+
+ *ptr = container_of(data - *pindex, struct trace_array,
+ trace_flags_index);
+}
+
static ssize_t
trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- long index = (long)filp->private_data;
+ void *tr_index = filp->private_data;
+ struct trace_array *tr;
+ unsigned int index;
char *buf;
- if (trace_flags & (1 << index))
+ get_tr_index(tr_index, &tr, &index);
+
+ if (tr->trace_flags & (1 << index))
buf = "1\n";
else
buf = "0\n";
@@ -6210,11 +6274,14 @@ static ssize_t
trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct trace_array *tr = &global_trace;
- long index = (long)filp->private_data;
+ void *tr_index = filp->private_data;
+ struct trace_array *tr;
+ unsigned int index;
unsigned long val;
int ret;
+ get_tr_index(tr_index, &tr, &index);
+
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
@@ -6298,21 +6365,39 @@ create_trace_option_file(struct trace_array *tr,
}
-static struct trace_option_dentry *
+static void
create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
struct trace_option_dentry *topts;
+ struct trace_options *tr_topts;
struct tracer_flags *flags;
struct tracer_opt *opts;
int cnt;
+ int i;
if (!tracer)
- return NULL;
+ return;
flags = tracer->flags;
if (!flags || !flags->opts)
- return NULL;
+ return;
+
+ /*
+ * If this is an instance, only create flags for tracers
+ * the instance may have.
+ */
+ if (!trace_ok_for_array(tracer, tr))
+ return;
+
+ for (i = 0; i < tr->nr_topts; i++) {
+ /*
+ * Check if these flags have already been added.
+ * Some tracers share flags.
+ */
+ if (tr->topts[i].tracer->flags == tracer->flags)
+ return;
+ }
opts = flags->opts;
@@ -6321,27 +6406,27 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
if (!topts)
- return NULL;
-
- for (cnt = 0; opts[cnt].name; cnt++)
- create_trace_option_file(tr, &topts[cnt], flags,
- &opts[cnt]);
-
- return topts;
-}
-
-static void
-destroy_trace_option_files(struct trace_option_dentry *topts)
-{
- int cnt;
+ return;
- if (!topts)
+ tr_topts = krealloc(tr->topts, sizeof(*tr->topts) * (tr->nr_topts + 1),
+ GFP_KERNEL);
+ if (!tr_topts) {
+ kfree(topts);
return;
+ }
- for (cnt = 0; topts[cnt].opt; cnt++)
- tracefs_remove(topts[cnt].entry);
+ tr->topts = tr_topts;
+ tr->topts[tr->nr_topts].tracer = tracer;
+ tr->topts[tr->nr_topts].topts = topts;
+ tr->nr_topts++;
- kfree(topts);
+ for (cnt = 0; opts[cnt].name; cnt++) {
+ create_trace_option_file(tr, &topts[cnt], flags,
+ &opts[cnt]);
+ WARN_ONCE(topts[cnt].entry == NULL,
+ "Failed to create trace option: %s",
+ opts[cnt].name);
+ }
}
static struct dentry *
@@ -6354,21 +6439,26 @@ create_trace_option_core_file(struct trace_array *tr,
if (!t_options)
return NULL;
- return trace_create_file(option, 0644, t_options, (void *)index,
- &trace_options_core_fops);
+ return trace_create_file(option, 0644, t_options,
+ (void *)&tr->trace_flags_index[index],
+ &trace_options_core_fops);
}
-static __init void create_trace_options_dir(struct trace_array *tr)
+static void create_trace_options_dir(struct trace_array *tr)
{
struct dentry *t_options;
+ bool top_level = tr == &global_trace;
int i;
t_options = trace_options_init_dentry(tr);
if (!t_options)
return;
- for (i = 0; trace_options[i]; i++)
- create_trace_option_core_file(tr, trace_options[i], i);
+ for (i = 0; trace_options[i]; i++) {
+ if (top_level ||
+ !((1 << i) & TOP_LEVEL_TRACE_FLAGS))
+ create_trace_option_core_file(tr, trace_options[i], i);
+ }
}
static ssize_t
@@ -6435,7 +6525,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
{
enum ring_buffer_flags rb_flags;
- rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+ rb_flags = tr->trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
buf->tr = tr;
@@ -6505,6 +6595,30 @@ static void free_trace_buffers(struct trace_array *tr)
#endif
}
+static void init_trace_flags_index(struct trace_array *tr)
+{
+ int i;
+
+ /* Used by the trace options files */
+ for (i = 0; i < TRACE_FLAGS_MAX_SIZE; i++)
+ tr->trace_flags_index[i] = i;
+}
+
+static void __update_tracer_options(struct trace_array *tr)
+{
+ struct tracer *t;
+
+ for (t = trace_types; t; t = t->next)
+ add_tracer_options(tr, t);
+}
+
+static void update_tracer_options(struct trace_array *tr)
+{
+ mutex_lock(&trace_types_lock);
+ __update_tracer_options(tr);
+ mutex_unlock(&trace_types_lock);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -6530,6 +6644,8 @@ static int instance_mkdir(const char *name)
if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL))
goto out_free_tr;
+ tr->trace_flags = global_trace.trace_flags;
+
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
raw_spin_lock_init(&tr->start_lock);
@@ -6555,6 +6671,8 @@ static int instance_mkdir(const char *name)
}
init_tracer_tracefs(tr, tr->dir);
+ init_trace_flags_index(tr);
+ __update_tracer_options(tr);
list_add(&tr->list, &ftrace_trace_arrays);
@@ -6580,6 +6698,7 @@ static int instance_rmdir(const char *name)
struct trace_array *tr;
int found = 0;
int ret;
+ int i;
mutex_lock(&trace_types_lock);
@@ -6602,9 +6721,14 @@ static int instance_rmdir(const char *name)
tracing_set_nop(tr);
event_trace_del_tracer(tr);
ftrace_destroy_function_files(tr);
- debugfs_remove_recursive(tr->dir);
+ tracefs_remove_recursive(tr->dir);
free_trace_buffers(tr);
+ for (i = 0; i < tr->nr_topts; i++) {
+ kfree(tr->topts[i].topts);
+ }
+ kfree(tr->topts);
+
kfree(tr->name);
kfree(tr);
@@ -6666,6 +6790,8 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("tracing_on", 0644, d_tracer,
tr, &rb_simple_fops);
+ create_trace_options_dir(tr);
+
#ifdef CONFIG_TRACER_MAX_TRACE
trace_create_file("tracing_max_latency", 0644, d_tracer,
&tr->max_latency, &tracing_max_lat_fops);
@@ -6721,7 +6847,9 @@ struct dentry *tracing_init_dentry(void)
if (tr->dir)
return NULL;
- if (WARN_ON(!debugfs_initialized()))
+ if (WARN_ON(!tracefs_initialized()) ||
+ (IS_ENABLED(CONFIG_DEBUG_FS) &&
+ WARN_ON(!debugfs_initialized())))
return ERR_PTR(-ENODEV);
/*
@@ -6861,11 +6989,7 @@ static __init int tracer_init_tracefs(void)
create_trace_instances(d_tracer);
- create_trace_options_dir(&global_trace);
-
- /* If the tracer was started via cmdline, create options for it here */
- if (global_trace.current_trace != &nop_trace)
- update_tracer_options(&global_trace, global_trace.current_trace);
+ update_tracer_options(&global_trace);
return 0;
}
@@ -6964,6 +7088,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
static atomic_t dump_running;
+ struct trace_array *tr = &global_trace;
unsigned int old_userobj;
unsigned long flags;
int cnt = 0, cpu;
@@ -6993,10 +7118,10 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
+ old_userobj = tr->trace_flags & TRACE_ITER_SYM_USEROBJ;
/* don't look at user memory in panic mode */
- trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
switch (oops_dump_mode) {
case DUMP_ALL:
@@ -7059,7 +7184,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
printk(KERN_TRACE "---------------------------------\n");
out_enable:
- trace_flags |= old_userobj;
+ tr->trace_flags |= old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
@@ -7074,6 +7199,12 @@ __init static int tracer_alloc_buffers(void)
int ring_buf_size;
int ret = -ENOMEM;
+ /*
+ * Make sure we don't accidently add more trace options
+ * than we have bits for.
+ */
+ BUILD_BUG_ON(TRACE_ITER_LAST_BIT > TRACE_FLAGS_MAX_SIZE);
+
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
goto out;
@@ -7132,6 +7263,8 @@ __init static int tracer_alloc_buffers(void)
ftrace_init_global_array_ops(&global_trace);
+ init_trace_flags_index(&global_trace);
+
register_tracer(&nop_trace);
/* All seems OK, enable tracing */
@@ -7148,12 +7281,7 @@ __init static int tracer_alloc_buffers(void)
INIT_LIST_HEAD(&global_trace.events);
list_add(&global_trace.list, &ftrace_trace_arrays);
- while (trace_boot_options) {
- char *option;
-
- option = strsep(&trace_boot_options, ",");
- trace_set_options(&global_trace, option);
- }
+ apply_trace_boot_options();
register_snapshot_cmd();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 74bde81601a9..919d9d07686f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -71,9 +71,6 @@ enum trace_type {
tstruct \
}
-#undef TP_ARGS
-#define TP_ARGS(args...) args
-
#undef FTRACE_ENTRY_DUP
#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk, filter)
@@ -156,9 +153,12 @@ struct trace_array_cpu {
pid_t pid;
kuid_t uid;
char comm[TASK_COMM_LEN];
+
+ bool ignore_pid;
};
struct tracer;
+struct trace_option_dentry;
struct trace_buffer {
struct trace_array *tr;
@@ -168,6 +168,19 @@ struct trace_buffer {
int cpu;
};
+#define TRACE_FLAGS_MAX_SIZE 32
+
+struct trace_options {
+ struct tracer *tracer;
+ struct trace_option_dentry *topts;
+};
+
+struct trace_pid_list {
+ unsigned int nr_pids;
+ int order;
+ pid_t *pids;
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
@@ -193,6 +206,7 @@ struct trace_array {
bool allocated_snapshot;
unsigned long max_latency;
#endif
+ struct trace_pid_list __rcu *filtered_pids;
/*
* max_lock is used to protect the swapping of buffers
* when taking a max snapshot. The buffers themselves are
@@ -216,13 +230,17 @@ struct trace_array {
#endif
int stop_count;
int clock_id;
+ int nr_topts;
struct tracer *current_trace;
+ unsigned int trace_flags;
+ unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
struct dentry *dir;
struct dentry *options;
struct dentry *percpu_dir;
struct dentry *event_dir;
+ struct trace_options *topts;
struct list_head systems;
struct list_head events;
cpumask_var_t tracing_cpumask; /* only trace on set CPUs */
@@ -333,6 +351,13 @@ struct tracer_flags {
#define TRACER_OPT(s, b) .name = #s, .bit = b
+struct trace_option_dentry {
+ struct tracer_opt *opt;
+ struct tracer_flags *flags;
+ struct trace_array *tr;
+ struct dentry *entry;
+};
+
/**
* struct tracer - a specific tracer and its callbacks to interact with tracefs
* @name: the name chosen to select it on the available_tracers file
@@ -611,29 +636,12 @@ void update_max_tr_single(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef CONFIG_STACKTRACE
-void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc);
-
-void ftrace_trace_stack_regs(struct ring_buffer *buffer, unsigned long flags,
- int skip, int pc, struct pt_regs *regs);
-
void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
int pc);
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_stack(struct ring_buffer *buffer,
- unsigned long flags, int skip, int pc)
-{
-}
-
-static inline void ftrace_trace_stack_regs(struct ring_buffer *buffer,
- unsigned long flags, int skip,
- int pc, struct pt_regs *regs)
-{
-}
-
static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
unsigned long flags, int pc)
{
@@ -659,7 +667,6 @@ extern int DYN_FTRACE_TEST_NAME2(void);
extern bool ring_buffer_expanded;
extern bool tracing_selftest_disabled;
-DECLARE_PER_CPU(int, ftrace_cpu_disabled);
#ifdef CONFIG_FTRACE_STARTUP_TEST
extern int trace_selftest_startup_function(struct tracer *trace,
@@ -707,8 +714,6 @@ int trace_array_printk_buf(struct ring_buffer *buffer,
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);
-extern unsigned long trace_flags;
-
extern char trace_find_mark(unsigned long long duration);
/* Standard output formatting function used for function return traces */
@@ -723,9 +728,14 @@ extern char trace_find_mark(unsigned long long duration);
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
#define TRACE_GRAPH_PRINT_IRQS 0x40
#define TRACE_GRAPH_PRINT_TAIL 0x80
+#define TRACE_GRAPH_SLEEP_TIME 0x100
+#define TRACE_GRAPH_GRAPH_TIME 0x200
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
+extern void ftrace_graph_sleep_time_control(bool enable);
+extern void ftrace_graph_graph_time_control(bool enable);
+
extern enum print_line_t
print_graph_function_flags(struct trace_iterator *iter, u32 flags);
extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
@@ -859,7 +869,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops);
#define ftrace_destroy_filter_files(ops) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */
-int ftrace_event_is_function(struct trace_event_call *call);
+bool ftrace_event_is_function(struct trace_event_call *call);
/*
* struct trace_parser - servers for reading the user input separated by spaces
@@ -897,42 +907,94 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
size_t cnt, loff_t *ppos);
/*
+ * Only create function graph options if function graph is configured.
+ */
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+# define FGRAPH_FLAGS \
+ C(DISPLAY_GRAPH, "display-graph"),
+#else
+# define FGRAPH_FLAGS
+#endif
+
+#ifdef CONFIG_BRANCH_TRACER
+# define BRANCH_FLAGS \
+ C(BRANCH, "branch"),
+#else
+# define BRANCH_FLAGS
+#endif
+
+#ifdef CONFIG_FUNCTION_TRACER
+# define FUNCTION_FLAGS \
+ C(FUNCTION, "function-trace"),
+# define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION
+#else
+# define FUNCTION_FLAGS
+# define FUNCTION_DEFAULT_FLAGS 0UL
+#endif
+
+#ifdef CONFIG_STACKTRACE
+# define STACK_FLAGS \
+ C(STACKTRACE, "stacktrace"),
+#else
+# define STACK_FLAGS
+#endif
+
+/*
* trace_iterator_flags is an enumeration that defines bit
* positions into trace_flags that controls the output.
*
* NOTE: These bits must match the trace_options array in
- * trace.c.
+ * trace.c (this macro guarantees it).
+ */
+#define TRACE_FLAGS \
+ C(PRINT_PARENT, "print-parent"), \
+ C(SYM_OFFSET, "sym-offset"), \
+ C(SYM_ADDR, "sym-addr"), \
+ C(VERBOSE, "verbose"), \
+ C(RAW, "raw"), \
+ C(HEX, "hex"), \
+ C(BIN, "bin"), \
+ C(BLOCK, "block"), \
+ C(PRINTK, "trace_printk"), \
+ C(ANNOTATE, "annotate"), \
+ C(USERSTACKTRACE, "userstacktrace"), \
+ C(SYM_USEROBJ, "sym-userobj"), \
+ C(PRINTK_MSGONLY, "printk-msg-only"), \
+ C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
+ C(LATENCY_FMT, "latency-format"), \
+ C(RECORD_CMD, "record-cmd"), \
+ C(OVERWRITE, "overwrite"), \
+ C(STOP_ON_FREE, "disable_on_free"), \
+ C(IRQ_INFO, "irq-info"), \
+ C(MARKERS, "markers"), \
+ FUNCTION_FLAGS \
+ FGRAPH_FLAGS \
+ STACK_FLAGS \
+ BRANCH_FLAGS
+
+/*
+ * By defining C, we can make TRACE_FLAGS a list of bit names
+ * that will define the bits for the flag masks.
*/
-enum trace_iterator_flags {
- TRACE_ITER_PRINT_PARENT = 0x01,
- TRACE_ITER_SYM_OFFSET = 0x02,
- TRACE_ITER_SYM_ADDR = 0x04,
- TRACE_ITER_VERBOSE = 0x08,
- TRACE_ITER_RAW = 0x10,
- TRACE_ITER_HEX = 0x20,
- TRACE_ITER_BIN = 0x40,
- TRACE_ITER_BLOCK = 0x80,
- TRACE_ITER_STACKTRACE = 0x100,
- TRACE_ITER_PRINTK = 0x200,
- TRACE_ITER_PREEMPTONLY = 0x400,
- TRACE_ITER_BRANCH = 0x800,
- TRACE_ITER_ANNOTATE = 0x1000,
- TRACE_ITER_USERSTACKTRACE = 0x2000,
- TRACE_ITER_SYM_USEROBJ = 0x4000,
- TRACE_ITER_PRINTK_MSGONLY = 0x8000,
- TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
- TRACE_ITER_LATENCY_FMT = 0x20000,
- TRACE_ITER_SLEEP_TIME = 0x40000,
- TRACE_ITER_GRAPH_TIME = 0x80000,
- TRACE_ITER_RECORD_CMD = 0x100000,
- TRACE_ITER_OVERWRITE = 0x200000,
- TRACE_ITER_STOP_ON_FREE = 0x400000,
- TRACE_ITER_IRQ_INFO = 0x800000,
- TRACE_ITER_MARKERS = 0x1000000,
- TRACE_ITER_FUNCTION = 0x2000000,
+#undef C
+#define C(a, b) TRACE_ITER_##a##_BIT
+
+enum trace_iterator_bits {
+ TRACE_FLAGS
+ /* Make sure we don't go more than we have bits for */
+ TRACE_ITER_LAST_BIT
};
/*
+ * By redefining C, we can make TRACE_FLAGS a list of masks that
+ * use the bits as defined above.
+ */
+#undef C
+#define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT)
+
+enum trace_iterator_flags { TRACE_FLAGS };
+
+/*
* TRACE_ITER_SYM_MASK masks the options in trace_flags that
* control the output of kernel symbols.
*/
@@ -946,7 +1008,7 @@ extern int enable_branch_tracing(struct trace_array *tr);
extern void disable_branch_tracing(void);
static inline int trace_branch_enable(struct trace_array *tr)
{
- if (trace_flags & TRACE_ITER_BRANCH)
+ if (tr->trace_flags & TRACE_ITER_BRANCH)
return enable_branch_tracing(tr);
return 0;
}
@@ -1269,6 +1331,7 @@ extern const char *__stop___trace_bprintk_fmt[];
extern const char *__start___tracepoint_str[];
extern const char *__stop___tracepoint_str[];
+void trace_printk_control(bool enabled);
void trace_printk_init_buffers(void);
void trace_printk_start_comm(void);
int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 40a14cbcf8e0..0f109c4130d3 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -43,7 +43,7 @@ static void trace_do_benchmark(void)
unsigned int std = 0;
/* Only run if the tracepoint is actually active */
- if (!trace_benchmark_event_enabled())
+ if (!trace_benchmark_event_enabled() || !tracing_is_on())
return;
local_irq_disable();
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e2e12ad3186f..3a2a73716a5b 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -125,25 +125,14 @@ void disable_branch_tracing(void)
mutex_unlock(&branch_tracing_mutex);
}
-static void start_branch_trace(struct trace_array *tr)
-{
- enable_branch_tracing(tr);
-}
-
-static void stop_branch_trace(struct trace_array *tr)
-{
- disable_branch_tracing();
-}
-
static int branch_trace_init(struct trace_array *tr)
{
- start_branch_trace(tr);
- return 0;
+ return enable_branch_tracing(tr);
}
static void branch_trace_reset(struct trace_array *tr)
{
- stop_branch_trace(tr);
+ disable_branch_tracing();
}
static enum print_line_t trace_branch_print(struct trace_iterator *iter,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7ca09cdc20c2..6bbc5f652355 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,11 +15,15 @@
#include <linux/kthread.h>
#include <linux/tracefs.h>
#include <linux/uaccess.h>
+#include <linux/bsearch.h>
#include <linux/module.h>
#include <linux/ctype.h>
+#include <linux/sort.h>
#include <linux/slab.h>
#include <linux/delay.h>
+#include <trace/events/sched.h>
+
#include <asm/setup.h>
#include "trace_output.h"
@@ -38,21 +42,19 @@ static LIST_HEAD(ftrace_common_fields);
static struct kmem_cache *field_cachep;
static struct kmem_cache *file_cachep;
-#define SYSTEM_FL_FREE_NAME (1 << 31)
-
static inline int system_refcount(struct event_subsystem *system)
{
- return system->ref_count & ~SYSTEM_FL_FREE_NAME;
+ return system->ref_count;
}
static int system_refcount_inc(struct event_subsystem *system)
{
- return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
+ return system->ref_count++;
}
static int system_refcount_dec(struct event_subsystem *system)
{
- return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
+ return --system->ref_count;
}
/* Double loops, do not use break, only goto's work */
@@ -212,12 +214,32 @@ int trace_event_raw_init(struct trace_event_call *call)
}
EXPORT_SYMBOL_GPL(trace_event_raw_init);
+bool trace_event_ignore_this_pid(struct trace_event_file *trace_file)
+{
+ struct trace_array *tr = trace_file->tr;
+ struct trace_array_cpu *data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+ if (!pid_list)
+ return false;
+
+ data = this_cpu_ptr(tr->trace_buffer.data);
+
+ return data->ignore_pid;
+}
+EXPORT_SYMBOL_GPL(trace_event_ignore_this_pid);
+
void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
struct trace_event_file *trace_file,
unsigned long len)
{
struct trace_event_call *event_call = trace_file->event_call;
+ if ((trace_file->flags & EVENT_FILE_FL_PID_FILTER) &&
+ trace_event_ignore_this_pid(trace_file))
+ return NULL;
+
local_save_flags(fbuffer->flags);
fbuffer->pc = preempt_count();
fbuffer->trace_file = trace_file;
@@ -338,6 +360,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
struct trace_event_call *call = file->event_call;
+ struct trace_array *tr = file->tr;
int ret = 0;
int disable;
@@ -401,7 +424,7 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
- if (trace_flags & TRACE_ITER_RECORD_CMD) {
+ if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
tracing_start_cmdline_record();
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
@@ -446,6 +469,142 @@ static void ftrace_clear_events(struct trace_array *tr)
mutex_unlock(&event_mutex);
}
+static int cmp_pid(const void *key, const void *elt)
+{
+ const pid_t *search_pid = key;
+ const pid_t *pid = elt;
+
+ if (*search_pid == *pid)
+ return 0;
+ if (*search_pid < *pid)
+ return -1;
+ return 1;
+}
+
+static bool
+check_ignore_pid(struct trace_pid_list *filtered_pids, struct task_struct *task)
+{
+ pid_t search_pid;
+ pid_t *pid;
+
+ /*
+ * Return false, because if filtered_pids does not exist,
+ * all pids are good to trace.
+ */
+ if (!filtered_pids)
+ return false;
+
+ search_pid = task->pid;
+
+ pid = bsearch(&search_pid, filtered_pids->pids,
+ filtered_pids->nr_pids, sizeof(pid_t),
+ cmp_pid);
+ if (!pid)
+ return true;
+
+ return false;
+}
+
+static void
+event_filter_pid_sched_switch_probe_pre(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, prev) &&
+ check_ignore_pid(pid_list, next));
+}
+
+static void
+event_filter_pid_sched_switch_probe_post(void *data, bool preempt,
+ struct task_struct *prev, struct task_struct *next)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, next));
+}
+
+static void
+event_filter_pid_sched_wakeup_probe_pre(void *data, struct task_struct *task)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /* Nothing to do if we are already tracing */
+ if (!this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ return;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, task));
+}
+
+static void
+event_filter_pid_sched_wakeup_probe_post(void *data, struct task_struct *task)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /* Nothing to do if we are not tracing */
+ if (this_cpu_read(tr->trace_buffer.data->ignore_pid))
+ return;
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ /* Set tracing if current is enabled */
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, current));
+}
+
+static void __ftrace_clear_event_pids(struct trace_array *tr)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_event_file *file;
+ int cpu;
+
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ if (!pid_list)
+ return;
+
+ unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_pre, tr);
+ unregister_trace_sched_switch(event_filter_pid_sched_switch_probe_post, tr);
+
+ unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr);
+ unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr);
+
+ list_for_each_entry(file, &tr->events, list) {
+ clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+ }
+
+ for_each_possible_cpu(cpu)
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->ignore_pid = false;
+
+ rcu_assign_pointer(tr->filtered_pids, NULL);
+
+ /* Wait till all users are no longer using pid filtering */
+ synchronize_sched();
+
+ free_pages((unsigned long)pid_list->pids, pid_list->order);
+ kfree(pid_list);
+}
+
+static void ftrace_clear_event_pids(struct trace_array *tr)
+{
+ mutex_lock(&event_mutex);
+ __ftrace_clear_event_pids(tr);
+ mutex_unlock(&event_mutex);
+}
+
static void __put_system(struct event_subsystem *system)
{
struct event_filter *filter = system->filter;
@@ -460,8 +619,7 @@ static void __put_system(struct event_subsystem *system)
kfree(filter->filter_string);
kfree(filter);
}
- if (system->ref_count & SYSTEM_FL_FREE_NAME)
- kfree(system->name);
+ kfree_const(system->name);
kfree(system);
}
@@ -779,6 +937,58 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+static void *p_start(struct seq_file *m, loff_t *pos)
+ __acquires(RCU)
+{
+ struct trace_pid_list *pid_list;
+ struct trace_array *tr = m->private;
+
+ /*
+ * Grab the mutex, to keep calls to p_next() having the same
+ * tr->filtered_pids as p_start() has.
+ * If we just passed the tr->filtered_pids around, then RCU would
+ * have been enough, but doing that makes things more complex.
+ */
+ mutex_lock(&event_mutex);
+ rcu_read_lock_sched();
+
+ pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ if (!pid_list || *pos >= pid_list->nr_pids)
+ return NULL;
+
+ return (void *)&pid_list->pids[*pos];
+}
+
+static void p_stop(struct seq_file *m, void *p)
+ __releases(RCU)
+{
+ rcu_read_unlock_sched();
+ mutex_unlock(&event_mutex);
+}
+
+static void *
+p_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *pid_list = rcu_dereference_sched(tr->filtered_pids);
+
+ (*pos)++;
+
+ if (*pos >= pid_list->nr_pids)
+ return NULL;
+
+ return (void *)&pid_list->pids[*pos];
+}
+
+static int p_show(struct seq_file *m, void *v)
+{
+ pid_t *pid = v;
+
+ seq_printf(m, "%d\n", *pid);
+ return 0;
+}
+
static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -1336,8 +1546,209 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
return r;
}
+static int max_pids(struct trace_pid_list *pid_list)
+{
+ return (PAGE_SIZE << pid_list->order) / sizeof(pid_t);
+}
+
+static void ignore_task_cpu(void *data)
+{
+ struct trace_array *tr = data;
+ struct trace_pid_list *pid_list;
+
+ /*
+ * This function is called by on_each_cpu() while the
+ * event_mutex is held.
+ */
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ mutex_is_locked(&event_mutex));
+
+ this_cpu_write(tr->trace_buffer.data->ignore_pid,
+ check_ignore_pid(pid_list, current));
+}
+
+static ssize_t
+ftrace_event_pid_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
+ struct trace_pid_list *filtered_pids = NULL;
+ struct trace_pid_list *pid_list = NULL;
+ struct trace_event_file *file;
+ struct trace_parser parser;
+ unsigned long val;
+ loff_t this_pos;
+ ssize_t read = 0;
+ ssize_t ret = 0;
+ pid_t pid;
+ int i;
+
+ if (!cnt)
+ return 0;
+
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))
+ return -ENOMEM;
+
+ mutex_lock(&event_mutex);
+ /*
+ * Load as many pids into the array before doing a
+ * swap from the tr->filtered_pids to the new list.
+ */
+ while (cnt > 0) {
+
+ this_pos = 0;
+
+ ret = trace_get_user(&parser, ubuf, cnt, &this_pos);
+ if (ret < 0 || !trace_parser_loaded(&parser))
+ break;
+
+ read += ret;
+ ubuf += ret;
+ cnt -= ret;
+
+ parser.buffer[parser.idx] = 0;
+
+ ret = -EINVAL;
+ if (kstrtoul(parser.buffer, 0, &val))
+ break;
+ if (val > INT_MAX)
+ break;
+
+ pid = (pid_t)val;
+
+ ret = -ENOMEM;
+ if (!pid_list) {
+ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
+ if (!pid_list)
+ break;
+
+ filtered_pids = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+ if (filtered_pids)
+ pid_list->order = filtered_pids->order;
+ else
+ pid_list->order = 0;
+
+ pid_list->pids = (void *)__get_free_pages(GFP_KERNEL,
+ pid_list->order);
+ if (!pid_list->pids)
+ break;
+
+ if (filtered_pids) {
+ pid_list->nr_pids = filtered_pids->nr_pids;
+ memcpy(pid_list->pids, filtered_pids->pids,
+ pid_list->nr_pids * sizeof(pid_t));
+ } else
+ pid_list->nr_pids = 0;
+ }
+
+ if (pid_list->nr_pids >= max_pids(pid_list)) {
+ pid_t *pid_page;
+
+ pid_page = (void *)__get_free_pages(GFP_KERNEL,
+ pid_list->order + 1);
+ if (!pid_page)
+ break;
+ memcpy(pid_page, pid_list->pids,
+ pid_list->nr_pids * sizeof(pid_t));
+ free_pages((unsigned long)pid_list->pids, pid_list->order);
+
+ pid_list->order++;
+ pid_list->pids = pid_page;
+ }
+
+ pid_list->pids[pid_list->nr_pids++] = pid;
+ trace_parser_clear(&parser);
+ ret = 0;
+ }
+ trace_parser_put(&parser);
+
+ if (ret < 0) {
+ if (pid_list)
+ free_pages((unsigned long)pid_list->pids, pid_list->order);
+ kfree(pid_list);
+ mutex_unlock(&event_mutex);
+ return ret;
+ }
+
+ if (!pid_list) {
+ mutex_unlock(&event_mutex);
+ return ret;
+ }
+
+ sort(pid_list->pids, pid_list->nr_pids, sizeof(pid_t), cmp_pid, NULL);
+
+ /* Remove duplicates */
+ for (i = 1; i < pid_list->nr_pids; i++) {
+ int start = i;
+
+ while (i < pid_list->nr_pids &&
+ pid_list->pids[i - 1] == pid_list->pids[i])
+ i++;
+
+ if (start != i) {
+ if (i < pid_list->nr_pids) {
+ memmove(&pid_list->pids[start], &pid_list->pids[i],
+ (pid_list->nr_pids - i) * sizeof(pid_t));
+ pid_list->nr_pids -= i - start;
+ i = start;
+ } else
+ pid_list->nr_pids = start;
+ }
+ }
+
+ rcu_assign_pointer(tr->filtered_pids, pid_list);
+
+ list_for_each_entry(file, &tr->events, list) {
+ set_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags);
+ }
+
+ if (filtered_pids) {
+ synchronize_sched();
+
+ free_pages((unsigned long)filtered_pids->pids, filtered_pids->order);
+ kfree(filtered_pids);
+ } else {
+ /*
+ * Register a probe that is called before all other probes
+ * to set ignore_pid if next or prev do not match.
+ * Register a probe this is called after all other probes
+ * to only keep ignore_pid set if next pid matches.
+ */
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_switch(event_filter_pid_sched_switch_probe_post,
+ tr, 0);
+
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre,
+ tr, INT_MAX);
+ register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post,
+ tr, 0);
+ }
+
+ /*
+ * Ignoring of pids is done at task switch. But we have to
+ * check for those tasks that are currently running.
+ * Always do this in case a pid was appended or removed.
+ */
+ on_each_cpu(ignore_task_cpu, tr, 1);
+
+ mutex_unlock(&event_mutex);
+
+ ret = read;
+ *ppos += read;
+
+ return ret;
+}
+
static int ftrace_event_avail_open(struct inode *inode, struct file *file);
static int ftrace_event_set_open(struct inode *inode, struct file *file);
+static int ftrace_event_set_pid_open(struct inode *inode, struct file *file);
static int ftrace_event_release(struct inode *inode, struct file *file);
static const struct seq_operations show_event_seq_ops = {
@@ -1354,6 +1765,13 @@ static const struct seq_operations show_set_event_seq_ops = {
.stop = t_stop,
};
+static const struct seq_operations show_set_pid_seq_ops = {
+ .start = p_start,
+ .next = p_next,
+ .show = p_show,
+ .stop = p_stop,
+};
+
static const struct file_operations ftrace_avail_fops = {
.open = ftrace_event_avail_open,
.read = seq_read,
@@ -1369,6 +1787,14 @@ static const struct file_operations ftrace_set_event_fops = {
.release = ftrace_event_release,
};
+static const struct file_operations ftrace_set_event_pid_fops = {
+ .open = ftrace_event_set_pid_open,
+ .read = seq_read,
+ .write = ftrace_event_pid_write,
+ .llseek = seq_lseek,
+ .release = ftrace_event_release,
+};
+
static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
@@ -1479,6 +1905,26 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
return ret;
}
+static int
+ftrace_event_set_pid_open(struct inode *inode, struct file *file)
+{
+ const struct seq_operations *seq_ops = &show_set_pid_seq_ops;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ if ((file->f_mode & FMODE_WRITE) &&
+ (file->f_flags & O_TRUNC))
+ ftrace_clear_event_pids(tr);
+
+ ret = ftrace_event_open(inode, file, seq_ops);
+ if (ret < 0)
+ trace_array_put(tr);
+ return ret;
+}
+
static struct event_subsystem *
create_new_subsystem(const char *name)
{
@@ -1492,13 +1938,9 @@ create_new_subsystem(const char *name)
system->ref_count = 1;
/* Only allocate if dynamic (kprobes and modules) */
- if (!core_kernel_data((unsigned long)name)) {
- system->ref_count |= SYSTEM_FL_FREE_NAME;
- system->name = kstrdup(name, GFP_KERNEL);
- if (!system->name)
- goto out_free;
- } else
- system->name = name;
+ system->name = kstrdup_const(name, GFP_KERNEL);
+ if (!system->name)
+ goto out_free;
system->filter = NULL;
@@ -1511,8 +1953,7 @@ create_new_subsystem(const char *name)
return system;
out_free:
- if (system->ref_count & SYSTEM_FL_FREE_NAME)
- kfree(system->name);
+ kfree_const(system->name);
kfree(system);
return NULL;
}
@@ -2478,6 +2919,9 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
return -ENOMEM;
}
+ entry = tracefs_create_file("set_event_pid", 0644, parent,
+ tr, &ftrace_set_event_pid_fops);
+
/* ring buffer internal formats */
trace_create_file("header_page", 0444, d_events,
ring_buffer_print_page_header,
@@ -2558,6 +3002,9 @@ int event_trace_del_tracer(struct trace_array *tr)
/* Disable any event triggers and associated soft-disabled events */
clear_event_triggers(tr);
+ /* Clear the pid list */
+ __ftrace_clear_event_pids(tr);
+
/* Disable any running events */
__ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
@@ -2595,16 +3042,16 @@ early_enable_events(struct trace_array *tr, bool disable_first)
if (!token)
break;
- if (!*token)
- continue;
- /* Restarting syscalls requires that we stop them first */
- if (disable_first)
- ftrace_set_clr_event(tr, token, 0);
+ if (*token) {
+ /* Restarting syscalls requires that we stop them first */
+ if (disable_first)
+ ftrace_set_clr_event(tr, token, 0);
- ret = ftrace_set_clr_event(tr, token, 1);
- if (ret)
- pr_warn("Failed to enable trace event: %s\n", token);
+ ret = ftrace_set_clr_event(tr, token, 1);
+ if (ret)
+ pr_warn("Failed to enable trace event: %s\n", token);
+ }
/* Put back the comma to allow this to be called again */
if (buf)
@@ -2891,7 +3338,9 @@ static __init void event_trace_self_tests(void)
static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
-static void
+static struct trace_array *event_tr;
+
+static void __init
function_test_events_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
@@ -2922,7 +3371,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
- trace_buffer_unlock_commit(buffer, event, flags, pc);
+ trace_buffer_unlock_commit(event_tr, buffer, event, flags, pc);
out:
atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
@@ -2938,6 +3387,9 @@ static struct ftrace_ops trace_ops __initdata =
static __init void event_trace_self_test_with_function(void)
{
int ret;
+ event_tr = top_trace_array();
+ if (WARN_ON(!event_tr))
+ return;
ret = register_ftrace_function(&trace_ops);
if (WARN_ON(ret < 0)) {
pr_info("Failed to enable function tracer for event tests\n");
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index bd1bf184c5c9..f93a219b18da 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -973,15 +973,15 @@ static bool is_string_field(struct ftrace_event_field *field)
field->filter_type == FILTER_PTR_STRING;
}
-static int is_legal_op(struct ftrace_event_field *field, int op)
+static bool is_legal_op(struct ftrace_event_field *field, int op)
{
if (is_string_field(field) &&
(op != OP_EQ && op != OP_NE && op != OP_GLOB))
- return 0;
+ return false;
if (!is_string_field(field) && op == OP_GLOB)
- return 0;
+ return false;
- return 1;
+ return true;
}
static filter_pred_fn_t select_comparison_fn(int op, int field_size,
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index adabf7da9113..39aa7aa66468 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -187,7 +187,7 @@ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call;
FTRACE_ENTRY_REG(call, struct_name, etype, \
PARAMS(tstruct), PARAMS(print), filter, NULL)
-int ftrace_event_is_function(struct trace_event_call *call)
+bool ftrace_event_is_function(struct trace_event_call *call)
{
return call == &event_function;
}
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ca98445782ac..a663cbb84107 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -83,13 +83,18 @@ static struct tracer_opt trace_opts[] = {
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
/* Display function name after trailing } */
{ TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
+ /* Include sleep time (scheduled out) between entry and return */
+ { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
+ /* Include time within nested functions */
+ { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
/* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
- TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
+ TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS |
+ TRACE_GRAPH_SLEEP_TIME | TRACE_GRAPH_GRAPH_TIME,
.opts = trace_opts
};
@@ -107,8 +112,8 @@ enum {
};
static void
-print_graph_duration(unsigned long long duration, struct trace_seq *s,
- u32 flags);
+print_graph_duration(struct trace_array *tr, unsigned long long duration,
+ struct trace_seq *s, u32 flags);
/* Add a function return address to the trace stack on thread info.*/
int
@@ -283,9 +288,6 @@ int __trace_graph_entry(struct trace_array *tr,
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
- if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
- return 0;
-
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
sizeof(*entry), flags, pc);
if (!event)
@@ -398,9 +400,6 @@ void __trace_graph_return(struct trace_array *tr,
struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
- if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
- return;
-
event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
sizeof(*entry), flags, pc);
if (!event)
@@ -653,6 +652,7 @@ static void
print_graph_irq(struct trace_iterator *iter, unsigned long addr,
enum trace_type type, int cpu, pid_t pid, u32 flags)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
@@ -660,7 +660,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
addr >= (unsigned long)__irqentry_text_end)
return;
- if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+ if (tr->trace_flags & TRACE_ITER_CONTEXT_INFO) {
/* Absolute time */
if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
print_graph_abs_time(iter->ts, s);
@@ -676,19 +676,19 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
print_graph_lat_fmt(s, ent);
}
/* No overhead */
- print_graph_duration(0, s, flags | FLAGS_FILL_START);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_START);
if (type == TRACE_GRAPH_ENT)
trace_seq_puts(s, "==========>");
else
trace_seq_puts(s, "<==========");
- print_graph_duration(0, s, flags | FLAGS_FILL_END);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_END);
trace_seq_putc(s, '\n');
}
@@ -726,11 +726,11 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
}
static void
-print_graph_duration(unsigned long long duration, struct trace_seq *s,
- u32 flags)
+print_graph_duration(struct trace_array *tr, unsigned long long duration,
+ struct trace_seq *s, u32 flags)
{
if (!(flags & TRACE_GRAPH_PRINT_DURATION) ||
- !(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ !(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
/* No real adata, just filling the column with spaces */
@@ -764,6 +764,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
struct trace_seq *s, u32 flags)
{
struct fgraph_data *data = iter->private;
+ struct trace_array *tr = iter->tr;
struct ftrace_graph_ret *graph_ret;
struct ftrace_graph_ent *call;
unsigned long long duration;
@@ -792,7 +793,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
}
/* Overhead and duration */
- print_graph_duration(duration, s, flags);
+ print_graph_duration(tr, duration, s, flags);
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
@@ -810,6 +811,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
{
struct ftrace_graph_ent *call = &entry->graph_ent;
struct fgraph_data *data = iter->private;
+ struct trace_array *tr = iter->tr;
int i;
if (data) {
@@ -825,7 +827,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
}
/* No time */
- print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL);
/* Function */
for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++)
@@ -849,6 +851,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
{
struct fgraph_data *data = iter->private;
struct trace_entry *ent = iter->ent;
+ struct trace_array *tr = iter->tr;
int cpu = iter->cpu;
/* Pid */
@@ -858,7 +861,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
/* Interrupt */
print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
- if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
/* Absolute time */
@@ -876,7 +879,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
}
/* Latency format */
- if (trace_flags & TRACE_ITER_LATENCY_FMT)
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT)
print_graph_lat_fmt(s, ent);
return;
@@ -1027,6 +1030,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
{
unsigned long long duration = trace->rettime - trace->calltime;
struct fgraph_data *data = iter->private;
+ struct trace_array *tr = iter->tr;
pid_t pid = ent->pid;
int cpu = iter->cpu;
int func_match = 1;
@@ -1058,7 +1062,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
print_graph_prologue(iter, s, 0, 0, flags);
/* Overhead and duration */
- print_graph_duration(duration, s, flags);
+ print_graph_duration(tr, duration, s, flags);
/* Closing brace */
for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++)
@@ -1091,7 +1095,8 @@ static enum print_line_t
print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
struct trace_iterator *iter, u32 flags)
{
- unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
+ struct trace_array *tr = iter->tr;
+ unsigned long sym_flags = (tr->trace_flags & TRACE_ITER_SYM_MASK);
struct fgraph_data *data = iter->private;
struct trace_event *event;
int depth = 0;
@@ -1104,7 +1109,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
print_graph_prologue(iter, s, 0, 0, flags);
/* No time */
- print_graph_duration(0, s, flags | FLAGS_FILL_FULL);
+ print_graph_duration(tr, 0, s, flags | FLAGS_FILL_FULL);
/* Indentation */
if (depth > 0)
@@ -1245,9 +1250,10 @@ static void print_lat_header(struct seq_file *s, u32 flags)
seq_printf(s, "#%.*s||| / \n", size, spaces);
}
-static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
+static void __print_graph_headers_flags(struct trace_array *tr,
+ struct seq_file *s, u32 flags)
{
- int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
+ int lat = tr->trace_flags & TRACE_ITER_LATENCY_FMT;
if (lat)
print_lat_header(s, flags);
@@ -1289,11 +1295,12 @@ static void print_graph_headers(struct seq_file *s)
void print_graph_headers_flags(struct seq_file *s, u32 flags)
{
struct trace_iterator *iter = s->private;
+ struct trace_array *tr = iter->tr;
- if (!(trace_flags & TRACE_ITER_CONTEXT_INFO))
+ if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
- if (trace_flags & TRACE_ITER_LATENCY_FMT) {
+ if (tr->trace_flags & TRACE_ITER_LATENCY_FMT) {
/* print nothing if the buffers are empty */
if (trace_empty(iter))
return;
@@ -1301,7 +1308,7 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
print_trace_header(s, iter);
}
- __print_graph_headers_flags(s, flags);
+ __print_graph_headers_flags(tr, s, flags);
}
void graph_trace_open(struct trace_iterator *iter)
@@ -1362,6 +1369,12 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
if (bit == TRACE_GRAPH_PRINT_IRQS)
ftrace_graph_skip_irqs = !set;
+ if (bit == TRACE_GRAPH_SLEEP_TIME)
+ ftrace_graph_sleep_time_control(set);
+
+ if (bit == TRACE_GRAPH_GRAPH_TIME)
+ ftrace_graph_graph_time_control(set);
+
return 0;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 8523ea345f2b..e4e56589ec1d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -31,7 +31,6 @@ enum {
static int trace_type __read_mostly;
static int save_flags;
-static bool function_enabled;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -57,22 +56,16 @@ irq_trace(void)
# define irq_trace() (0)
#endif
-#define TRACE_DISPLAY_GRAPH 1
-
-static struct tracer_opt trace_opts[] = {
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- /* display latency trace as call graph */
- { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+static int irqsoff_display_graph(struct trace_array *tr, int set);
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+#else
+static inline int irqsoff_display_graph(struct trace_array *tr, int set)
+{
+ return -EINVAL;
+}
+# define is_graph(tr) false
#endif
- { } /* Empty entry */
-};
-
-static struct tracer_flags tracer_flags = {
- .val = 0,
- .opts = trace_opts,
-};
-
-#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
/*
* Sequence count - we record it when starting a measurement and
@@ -152,15 +145,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip,
#endif /* CONFIG_FUNCTION_TRACER */
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int
-irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+static int irqsoff_display_graph(struct trace_array *tr, int set)
{
int cpu;
- if (!(bit & TRACE_DISPLAY_GRAPH))
- return -EINVAL;
-
- if (!(is_graph() ^ set))
+ if (!(is_graph(tr) ^ set))
return 0;
stop_irqsoff_tracer(irqsoff_trace, !set);
@@ -209,7 +198,7 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
static void irqsoff_trace_open(struct trace_iterator *iter)
{
- if (is_graph())
+ if (is_graph(iter->tr))
graph_trace_open(iter);
}
@@ -231,7 +220,7 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
* In graph mode call the graph tracer output function,
* otherwise go with the TRACE_FN event handler
*/
- if (is_graph())
+ if (is_graph(iter->tr))
return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
return TRACE_TYPE_UNHANDLED;
@@ -239,7 +228,9 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
static void irqsoff_print_header(struct seq_file *s)
{
- if (is_graph())
+ struct trace_array *tr = irqsoff_trace;
+
+ if (is_graph(tr))
print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
else
trace_default_header(s);
@@ -250,7 +241,7 @@ __trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
unsigned long flags, int pc)
{
- if (is_graph())
+ if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, flags, pc);
else
trace_function(tr, ip, parent_ip, flags, pc);
@@ -259,27 +250,23 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int
-irqsoff_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
-{
- return -EINVAL;
-}
-
+#ifdef CONFIG_FUNCTION_TRACER
static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
{
return -1;
}
+#endif
static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
{
return TRACE_TYPE_UNHANDLED;
}
-static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
static void irqsoff_trace_open(struct trace_iterator *iter) { }
static void irqsoff_trace_close(struct trace_iterator *iter) { }
#ifdef CONFIG_FUNCTION_TRACER
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
static void irqsoff_print_header(struct seq_file *s)
{
trace_default_header(s);
@@ -295,16 +282,16 @@ static void irqsoff_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
- return 0;
+ return false;
} else {
if (delta <= tr->max_latency)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static void
@@ -523,12 +510,15 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
+#ifdef CONFIG_FUNCTION_TRACER
+static bool function_enabled;
+
static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
{
int ret;
/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
return 0;
if (graph)
@@ -556,20 +546,40 @@ static void unregister_irqsoff_function(struct trace_array *tr, int graph)
function_enabled = false;
}
-static void irqsoff_function_set(struct trace_array *tr, int set)
+static int irqsoff_function_set(struct trace_array *tr, u32 mask, int set)
{
+ if (!(mask & TRACE_ITER_FUNCTION))
+ return 0;
+
if (set)
- register_irqsoff_function(tr, is_graph(), 1);
+ register_irqsoff_function(tr, is_graph(tr), 1);
else
- unregister_irqsoff_function(tr, is_graph());
+ unregister_irqsoff_function(tr, is_graph(tr));
+ return 1;
+}
+#else
+static int register_irqsoff_function(struct trace_array *tr, int graph, int set)
+{
+ return 0;
+}
+static void unregister_irqsoff_function(struct trace_array *tr, int graph) { }
+static inline int irqsoff_function_set(struct trace_array *tr, u32 mask, int set)
+{
+ return 0;
}
+#endif /* CONFIG_FUNCTION_TRACER */
static int irqsoff_flag_changed(struct trace_array *tr, u32 mask, int set)
{
struct tracer *tracer = tr->current_trace;
- if (mask & TRACE_ITER_FUNCTION)
- irqsoff_function_set(tr, set);
+ if (irqsoff_function_set(tr, mask, set))
+ return 0;
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ return irqsoff_display_graph(tr, set);
+#endif
return trace_keep_overwrite(tracer, mask, set);
}
@@ -602,7 +612,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
if (irqsoff_busy)
return -EBUSY;
- save_flags = trace_flags;
+ save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
@@ -618,7 +628,7 @@ static int __irqsoff_tracer_init(struct trace_array *tr)
/* Only toplevel instance supports graph tracing */
if (start_irqsoff_tracer(tr, (tr->flags & TRACE_ARRAY_FL_GLOBAL &&
- is_graph())))
+ is_graph(tr))))
printk(KERN_ERR "failed to start irqsoff tracer\n");
irqsoff_busy = true;
@@ -630,7 +640,7 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
- stop_irqsoff_tracer(tr, is_graph());
+ stop_irqsoff_tracer(tr, is_graph(tr));
set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
@@ -666,8 +676,6 @@ static struct tracer irqsoff_tracer __read_mostly =
.print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
- .flags = &tracer_flags,
- .set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
@@ -700,8 +708,6 @@ static struct tracer preemptoff_tracer __read_mostly =
.print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
- .flags = &tracer_flags,
- .set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
@@ -736,8 +742,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.print_max = true,
.print_header = irqsoff_print_header,
.print_line = irqsoff_print_line,
- .flags = &tracer_flags,
- .set_flag = irqsoff_set_flag,
.flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3ccf5c2c1320..57149bce6aad 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -21,20 +21,22 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS];
+ struct trace_array *tr;
unsigned int old_userobj;
int cnt = 0, cpu;
trace_init_global_iter(&iter);
iter.buffer_iter = buffer_iter;
+ tr = iter.tr;
for_each_tracing_cpu(cpu) {
atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
- old_userobj = trace_flags;
+ old_userobj = tr->trace_flags;
/* don't look at user memory in panic mode */
- trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+ tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
kdb_printf("Dumping ftrace buffer:\n");
@@ -82,7 +84,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
kdb_printf("---------------------------------\n");
out:
- trace_flags = old_userobj;
+ tr->trace_flags = old_userobj;
for_each_tracing_cpu(cpu) {
atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 638e110c5bfd..2be8c4f2403d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -314,7 +314,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
entry->rw = *rw;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
}
void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -344,7 +344,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
entry->map = *map;
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, 0, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, 0, pc);
}
void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e481a84aeea..282982195e09 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -322,8 +322,8 @@ seq_print_sym_offset(struct trace_seq *s, const char *fmt,
# define IP_FMT "%016lx"
#endif
-int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
- unsigned long ip, unsigned long sym_flags)
+static int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+ unsigned long ip, unsigned long sym_flags)
{
struct file *file = NULL;
unsigned long vmstart = 0;
@@ -355,50 +355,6 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
}
int
-seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
- unsigned long sym_flags)
-{
- struct mm_struct *mm = NULL;
- unsigned int i;
-
- if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
- struct task_struct *task;
- /*
- * we do the lookup on the thread group leader,
- * since individual threads might have already quit!
- */
- rcu_read_lock();
- task = find_task_by_vpid(entry->tgid);
- if (task)
- mm = get_task_mm(task);
- rcu_read_unlock();
- }
-
- for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
- unsigned long ip = entry->caller[i];
-
- if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
- break;
-
- trace_seq_puts(s, " => ");
-
- if (!ip) {
- trace_seq_puts(s, "??");
- trace_seq_putc(s, '\n');
- continue;
- }
-
- seq_print_user_ip(s, mm, ip, sym_flags);
- trace_seq_putc(s, '\n');
- }
-
- if (mm)
- mmput(mm);
-
- return !trace_seq_has_overflowed(s);
-}
-
-int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
{
if (!ip) {
@@ -520,7 +476,8 @@ char trace_find_mark(unsigned long long d)
static int
lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
{
- unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
+ struct trace_array *tr = iter->tr;
+ unsigned long verbose = tr->trace_flags & TRACE_ITER_VERBOSE;
unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
unsigned long long rel_ts = next_ts - iter->ts;
@@ -563,6 +520,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
int trace_print_context(struct trace_iterator *iter)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *entry = iter->ent;
unsigned long long t;
@@ -574,7 +532,7 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
- if (trace_flags & TRACE_ITER_IRQ_INFO)
+ if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
if (iter->iter_flags & TRACE_FILE_TIME_IN_NS) {
@@ -590,14 +548,15 @@ int trace_print_context(struct trace_iterator *iter)
int trace_print_lat_context(struct trace_iterator *iter)
{
- u64 next_ts;
+ struct trace_array *tr = iter->tr;
/* trace_find_next_entry will reset ent_size */
int ent_size = iter->ent_size;
struct trace_seq *s = &iter->seq;
+ u64 next_ts;
struct trace_entry *entry = iter->ent,
*next_entry = trace_find_next_entry(iter, NULL,
&next_ts);
- unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+ unsigned long verbose = (tr->trace_flags & TRACE_ITER_VERBOSE);
/* Restore the original ent_size */
iter->ent_size = ent_size;
@@ -1079,13 +1038,49 @@ static struct trace_event trace_stack_event = {
static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
int flags, struct trace_event *event)
{
+ struct trace_array *tr = iter->tr;
struct userstack_entry *field;
struct trace_seq *s = &iter->seq;
+ struct mm_struct *mm = NULL;
+ unsigned int i;
trace_assign_type(field, iter->ent);
trace_seq_puts(s, "<user stack trace>\n");
- seq_print_userip_objs(field, s, flags);
+
+ if (tr->trace_flags & TRACE_ITER_SYM_USEROBJ) {
+ struct task_struct *task;
+ /*
+ * we do the lookup on the thread group leader,
+ * since individual threads might have already quit!
+ */
+ rcu_read_lock();
+ task = find_task_by_vpid(field->tgid);
+ if (task)
+ mm = get_task_mm(task);
+ rcu_read_unlock();
+ }
+
+ for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+ unsigned long ip = field->caller[i];
+
+ if (ip == ULONG_MAX || trace_seq_has_overflowed(s))
+ break;
+
+ trace_seq_puts(s, " => ");
+
+ if (!ip) {
+ trace_seq_puts(s, "??");
+ trace_seq_putc(s, '\n');
+ continue;
+ }
+
+ seq_print_user_ip(s, mm, ip, flags);
+ trace_seq_putc(s, '\n');
+ }
+
+ if (mm)
+ mmput(mm);
return trace_handle_return(s);
}
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 4cbfe85b99c8..fabc49bcd493 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -14,10 +14,6 @@ trace_print_printk_msg_only(struct trace_iterator *iter);
extern int
seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
unsigned long sym_flags);
-extern int seq_print_userip_objs(const struct userstack_entry *entry,
- struct trace_seq *s, unsigned long sym_flags);
-extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
- unsigned long ip, unsigned long sym_flags);
extern int trace_print_context(struct trace_iterator *iter);
extern int trace_print_lat_context(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 36c1455b7567..1c2b28536feb 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -178,6 +178,12 @@ static inline void format_mod_start(void) { }
static inline void format_mod_stop(void) { }
#endif /* CONFIG_MODULES */
+static bool __read_mostly trace_printk_enabled = true;
+
+void trace_printk_control(bool enabled)
+{
+ trace_printk_enabled = enabled;
+}
__initdata_or_module static
struct notifier_block module_trace_bprintk_format_nb = {
@@ -192,7 +198,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...)
if (unlikely(!fmt))
return 0;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
va_start(ap, fmt);
@@ -207,7 +213,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
if (unlikely(!fmt))
return 0;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
return trace_vbprintk(ip, fmt, ap);
@@ -219,7 +225,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...)
int ret;
va_list ap;
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
va_start(ap, fmt);
@@ -231,7 +237,7 @@ EXPORT_SYMBOL_GPL(__trace_printk);
int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
{
- if (!(trace_flags & TRACE_ITER_PRINTK))
+ if (!trace_printk_enabled)
return 0;
return trace_vprintk(ip, fmt, ap);
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index b98dee914542..f6398db09114 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -302,15 +302,15 @@ static nokprobe_inline void call_fetch(struct fetch_param *fprm,
}
/* Check the name is good for event/group/fields */
-static inline int is_good_name(const char *name)
+static inline bool is_good_name(const char *name)
{
if (!isalpha(*name) && *name != '_')
- return 0;
+ return false;
while (*++name != '\0') {
if (!isalpha(*name) && !isdigit(*name) && *name != '_')
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static inline struct event_file_link *
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4bcfbac289ff..9d4399b553a3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -34,31 +34,28 @@ static arch_spinlock_t wakeup_lock =
static void wakeup_reset(struct trace_array *tr);
static void __wakeup_reset(struct trace_array *tr);
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
-static void wakeup_graph_return(struct ftrace_graph_ret *trace);
static int save_flags;
-static bool function_enabled;
-
-#define TRACE_DISPLAY_GRAPH 1
-static struct tracer_opt trace_opts[] = {
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- /* display latency trace as call graph */
- { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+static int wakeup_display_graph(struct trace_array *tr, int set);
+# define is_graph(tr) ((tr)->trace_flags & TRACE_ITER_DISPLAY_GRAPH)
+#else
+static inline int wakeup_display_graph(struct trace_array *tr, int set)
+{
+ return 0;
+}
+# define is_graph(tr) false
#endif
- { } /* Empty entry */
-};
-
-static struct tracer_flags tracer_flags = {
- .val = 0,
- .opts = trace_opts,
-};
-#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
#ifdef CONFIG_FUNCTION_TRACER
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
+static void wakeup_graph_return(struct ftrace_graph_ret *trace);
+
+static bool function_enabled;
+
/*
* Prologue for the wakeup function tracers.
*
@@ -128,14 +125,13 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip,
atomic_dec(&data->disabled);
preempt_enable_notrace();
}
-#endif /* CONFIG_FUNCTION_TRACER */
static int register_wakeup_function(struct trace_array *tr, int graph, int set)
{
int ret;
/* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
- if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ if (function_enabled || (!set && !(tr->trace_flags & TRACE_ITER_FUNCTION)))
return 0;
if (graph)
@@ -163,20 +159,40 @@ static void unregister_wakeup_function(struct trace_array *tr, int graph)
function_enabled = false;
}
-static void wakeup_function_set(struct trace_array *tr, int set)
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
{
+ if (!(mask & TRACE_ITER_FUNCTION))
+ return 0;
+
if (set)
- register_wakeup_function(tr, is_graph(), 1);
+ register_wakeup_function(tr, is_graph(tr), 1);
else
- unregister_wakeup_function(tr, is_graph());
+ unregister_wakeup_function(tr, is_graph(tr));
+ return 1;
+}
+#else
+static int register_wakeup_function(struct trace_array *tr, int graph, int set)
+{
+ return 0;
+}
+static void unregister_wakeup_function(struct trace_array *tr, int graph) { }
+static int wakeup_function_set(struct trace_array *tr, u32 mask, int set)
+{
+ return 0;
}
+#endif /* CONFIG_FUNCTION_TRACER */
static int wakeup_flag_changed(struct trace_array *tr, u32 mask, int set)
{
struct tracer *tracer = tr->current_trace;
- if (mask & TRACE_ITER_FUNCTION)
- wakeup_function_set(tr, set);
+ if (wakeup_function_set(tr, mask, set))
+ return 0;
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ if (mask & TRACE_ITER_DISPLAY_GRAPH)
+ return wakeup_display_graph(tr, set);
+#endif
return trace_keep_overwrite(tracer, mask, set);
}
@@ -203,14 +219,9 @@ static void stop_func_tracer(struct trace_array *tr, int graph)
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static int
-wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
+static int wakeup_display_graph(struct trace_array *tr, int set)
{
-
- if (!(bit & TRACE_DISPLAY_GRAPH))
- return -EINVAL;
-
- if (!(is_graph() ^ set))
+ if (!(is_graph(tr) ^ set))
return 0;
stop_func_tracer(tr, !set);
@@ -259,7 +270,7 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace)
static void wakeup_trace_open(struct trace_iterator *iter)
{
- if (is_graph())
+ if (is_graph(iter->tr))
graph_trace_open(iter);
}
@@ -279,7 +290,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
* In graph mode call the graph tracer output function,
* otherwise go with the TRACE_FN event handler
*/
- if (is_graph())
+ if (is_graph(iter->tr))
return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
return TRACE_TYPE_UNHANDLED;
@@ -287,7 +298,7 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
static void wakeup_print_header(struct seq_file *s)
{
- if (is_graph())
+ if (is_graph(wakeup_trace))
print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
else
trace_default_header(s);
@@ -298,7 +309,7 @@ __trace_function(struct trace_array *tr,
unsigned long ip, unsigned long parent_ip,
unsigned long flags, int pc)
{
- if (is_graph())
+ if (is_graph(tr))
trace_graph_function(tr, ip, parent_ip, flags, pc);
else
trace_function(tr, ip, parent_ip, flags, pc);
@@ -306,27 +317,20 @@ __trace_function(struct trace_array *tr,
#else
#define __trace_function trace_function
-static int
-wakeup_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
-{
- return -EINVAL;
-}
-
-static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
-{
- return -1;
-}
-
static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
{
return TRACE_TYPE_UNHANDLED;
}
-static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
static void wakeup_trace_open(struct trace_iterator *iter) { }
static void wakeup_trace_close(struct trace_iterator *iter) { }
#ifdef CONFIG_FUNCTION_TRACER
+static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
+{
+ return -1;
+}
+static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
static void wakeup_print_header(struct seq_file *s)
{
trace_default_header(s);
@@ -342,16 +346,16 @@ static void wakeup_print_header(struct seq_file *s)
/*
* Should this new latency be reported/recorded?
*/
-static int report_latency(struct trace_array *tr, cycle_t delta)
+static bool report_latency(struct trace_array *tr, cycle_t delta)
{
if (tracing_thresh) {
if (delta < tracing_thresh)
- return 0;
+ return false;
} else {
if (delta <= tr->max_latency)
- return 0;
+ return false;
}
- return 1;
+ return true;
}
static void
@@ -388,7 +392,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
}
static void
@@ -416,7 +420,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))
- trace_buffer_unlock_commit(buffer, event, flags, pc);
+ trace_buffer_unlock_commit(tr, buffer, event, flags, pc);
}
static void notrace
@@ -635,7 +639,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
*/
smp_wmb();
- if (start_func_tracer(tr, is_graph()))
+ if (start_func_tracer(tr, is_graph(tr)))
printk(KERN_ERR "failed to start wakeup tracer\n");
return;
@@ -648,7 +652,7 @@ fail_deprobe:
static void stop_wakeup_tracer(struct trace_array *tr)
{
tracer_enabled = 0;
- stop_func_tracer(tr, is_graph());
+ stop_func_tracer(tr, is_graph(tr));
unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -659,7 +663,7 @@ static bool wakeup_busy;
static int __wakeup_tracer_init(struct trace_array *tr)
{
- save_flags = trace_flags;
+ save_flags = tr->trace_flags;
/* non overwrite screws up the latency tracers */
set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
@@ -740,8 +744,6 @@ static struct tracer wakeup_tracer __read_mostly =
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
- .flags = &tracer_flags,
- .set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
@@ -762,8 +764,6 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
- .flags = &tracer_flags,
- .set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
@@ -784,8 +784,6 @@ static struct tracer wakeup_dl_tracer __read_mostly =
.print_max = true,
.print_header = wakeup_print_header,
.print_line = wakeup_print_line,
- .flags = &tracer_flags,
- .set_flag = wakeup_set_flag,
.flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8abf1ba18085..dda9e6742950 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -16,24 +16,22 @@
#include "trace.h"
-#define STACK_TRACE_ENTRIES 500
-
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
-static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+unsigned stack_trace_index[STACK_TRACE_ENTRIES];
/*
* Reserve one entry for the passed in ip. This will allow
* us to remove most or all of the stack size overhead
* added by the stack tracer itself.
*/
-static struct stack_trace max_stack_trace = {
+struct stack_trace stack_trace_max = {
.max_entries = STACK_TRACE_ENTRIES - 1,
.entries = &stack_dump_trace[0],
};
-static unsigned long max_stack_size;
-static arch_spinlock_t max_stack_lock =
+unsigned long stack_trace_max_size;
+arch_spinlock_t stack_trace_max_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
static DEFINE_PER_CPU(int, trace_active);
@@ -42,30 +40,38 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
-static inline void print_max_stack(void)
+void stack_trace_print(void)
{
long i;
int size;
pr_emerg(" Depth Size Location (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries);
+ stack_trace_max.nr_entries);
- for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ for (i = 0; i < stack_trace_max.nr_entries; i++) {
if (stack_dump_trace[i] == ULONG_MAX)
break;
- if (i+1 == max_stack_trace.nr_entries ||
+ if (i+1 == stack_trace_max.nr_entries ||
stack_dump_trace[i+1] == ULONG_MAX)
- size = stack_dump_index[i];
+ size = stack_trace_index[i];
else
- size = stack_dump_index[i] - stack_dump_index[i+1];
+ size = stack_trace_index[i] - stack_trace_index[i+1];
- pr_emerg("%3ld) %8d %5d %pS\n", i, stack_dump_index[i],
+ pr_emerg("%3ld) %8d %5d %pS\n", i, stack_trace_index[i],
size, (void *)stack_dump_trace[i]);
}
}
-static inline void
+/*
+ * When arch-specific code overides this function, the following
+ * data should be filled up, assuming stack_trace_max_lock is held to
+ * prevent concurrent updates.
+ * stack_trace_index[]
+ * stack_trace_max
+ * stack_trace_max_size
+ */
+void __weak
check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags; unsigned long *p, *top, *start;
@@ -78,7 +84,7 @@ check_stack(unsigned long ip, unsigned long *stack)
/* Remove the frame of the tracer */
this_size -= frame_size;
- if (this_size <= max_stack_size)
+ if (this_size <= stack_trace_max_size)
return;
/* we do not handle interrupt stacks yet */
@@ -90,7 +96,7 @@ check_stack(unsigned long ip, unsigned long *stack)
return;
local_irq_save(flags);
- arch_spin_lock(&max_stack_lock);
+ arch_spin_lock(&stack_trace_max_lock);
/*
* RCU may not be watching, make it see us.
@@ -103,18 +109,18 @@ check_stack(unsigned long ip, unsigned long *stack)
this_size -= tracer_frame;
/* a race could have already updated it */
- if (this_size <= max_stack_size)
+ if (this_size <= stack_trace_max_size)
goto out;
- max_stack_size = this_size;
+ stack_trace_max_size = this_size;
- max_stack_trace.nr_entries = 0;
- max_stack_trace.skip = 3;
+ stack_trace_max.nr_entries = 0;
+ stack_trace_max.skip = 3;
- save_stack_trace(&max_stack_trace);
+ save_stack_trace(&stack_trace_max);
/* Skip over the overhead of the stack tracer itself */
- for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ for (i = 0; i < stack_trace_max.nr_entries; i++) {
if (stack_dump_trace[i] == ip)
break;
}
@@ -134,18 +140,18 @@ check_stack(unsigned long ip, unsigned long *stack)
* loop will only happen once. This code only takes place
* on a new max, so it is far from a fast path.
*/
- while (i < max_stack_trace.nr_entries) {
+ while (i < stack_trace_max.nr_entries) {
int found = 0;
- stack_dump_index[x] = this_size;
+ stack_trace_index[x] = this_size;
p = start;
- for (; p < top && i < max_stack_trace.nr_entries; p++) {
+ for (; p < top && i < stack_trace_max.nr_entries; p++) {
if (stack_dump_trace[i] == ULONG_MAX)
break;
if (*p == stack_dump_trace[i]) {
stack_dump_trace[x] = stack_dump_trace[i++];
- this_size = stack_dump_index[x++] =
+ this_size = stack_trace_index[x++] =
(top - p) * sizeof(unsigned long);
found = 1;
/* Start the search from here */
@@ -160,7 +166,7 @@ check_stack(unsigned long ip, unsigned long *stack)
if (unlikely(!tracer_frame)) {
tracer_frame = (p - stack) *
sizeof(unsigned long);
- max_stack_size -= tracer_frame;
+ stack_trace_max_size -= tracer_frame;
}
}
}
@@ -169,18 +175,18 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
- max_stack_trace.nr_entries = x;
+ stack_trace_max.nr_entries = x;
for (; x < i; x++)
stack_dump_trace[x] = ULONG_MAX;
if (task_stack_end_corrupted(current)) {
- print_max_stack();
+ stack_trace_print();
BUG();
}
out:
rcu_irq_exit();
- arch_spin_unlock(&max_stack_lock);
+ arch_spin_unlock(&stack_trace_max_lock);
local_irq_restore(flags);
}
@@ -251,9 +257,9 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
cpu = smp_processor_id();
per_cpu(trace_active, cpu)++;
- arch_spin_lock(&max_stack_lock);
+ arch_spin_lock(&stack_trace_max_lock);
*ptr = val;
- arch_spin_unlock(&max_stack_lock);
+ arch_spin_unlock(&stack_trace_max_lock);
per_cpu(trace_active, cpu)--;
local_irq_restore(flags);
@@ -273,7 +279,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n > stack_trace_max.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
m->private = (void *)n;
@@ -296,7 +302,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
cpu = smp_processor_id();
per_cpu(trace_active, cpu)++;
- arch_spin_lock(&max_stack_lock);
+ arch_spin_lock(&stack_trace_max_lock);
if (*pos == 0)
return SEQ_START_TOKEN;
@@ -308,7 +314,7 @@ static void t_stop(struct seq_file *m, void *p)
{
int cpu;
- arch_spin_unlock(&max_stack_lock);
+ arch_spin_unlock(&stack_trace_max_lock);
cpu = smp_processor_id();
per_cpu(trace_active, cpu)--;
@@ -343,9 +349,9 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries);
+ stack_trace_max.nr_entries);
- if (!stack_tracer_enabled && !max_stack_size)
+ if (!stack_tracer_enabled && !stack_trace_max_size)
print_disabled(m);
return 0;
@@ -353,17 +359,17 @@ static int t_show(struct seq_file *m, void *v)
i = *(long *)v;
- if (i >= max_stack_trace.nr_entries ||
+ if (i >= stack_trace_max.nr_entries ||
stack_dump_trace[i] == ULONG_MAX)
return 0;
- if (i+1 == max_stack_trace.nr_entries ||
+ if (i+1 == stack_trace_max.nr_entries ||
stack_dump_trace[i+1] == ULONG_MAX)
- size = stack_dump_index[i];
+ size = stack_trace_index[i];
else
- size = stack_dump_index[i] - stack_dump_index[i+1];
+ size = stack_trace_index[i] - stack_trace_index[i+1];
- seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
+ seq_printf(m, "%3ld) %8d %5d ", i, stack_trace_index[i], size);
trace_lookup_stack(m, i);
@@ -453,7 +459,7 @@ static __init int stack_trace_init(void)
return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
- &max_stack_size, &stack_max_size_fops);
+ &stack_trace_max_size, &stack_max_size_fops);
trace_create_file("stack_trace", 0444, d_tracer,
NULL, &stack_trace_fops);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7d567a4b9fa7..0655afbea83f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -110,6 +110,7 @@ static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
+ struct trace_array *tr = iter->tr;
struct trace_seq *s = &iter->seq;
struct trace_entry *ent = iter->ent;
struct syscall_trace_enter *trace;
@@ -136,7 +137,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags,
goto end;
/* parameter types */
- if (trace_flags & TRACE_ITER_VERBOSE)
+ if (tr->trace_flags & TRACE_ITER_VERBOSE)
trace_seq_printf(s, "%s ", entry->types[i]);
/* parameter values */
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 3490407dc7b7..ecd536de603a 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -91,11 +91,13 @@ static void debug_print_probes(struct tracepoint_func *funcs)
printk(KERN_DEBUG "Probe %d : %p\n", i, funcs[i].func);
}
-static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
- struct tracepoint_func *tp_func)
+static struct tracepoint_func *
+func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
+ int prio)
{
- int nr_probes = 0;
struct tracepoint_func *old, *new;
+ int nr_probes = 0;
+ int pos = -1;
if (WARN_ON(!tp_func->func))
return ERR_PTR(-EINVAL);
@@ -104,18 +106,33 @@ static struct tracepoint_func *func_add(struct tracepoint_func **funcs,
old = *funcs;
if (old) {
/* (N -> N+1), (N != 0, 1) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ /* Insert before probes of lower priority */
+ if (pos < 0 && old[nr_probes].prio < prio)
+ pos = nr_probes;
if (old[nr_probes].func == tp_func->func &&
old[nr_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
+ }
}
/* + 2 : one for new probe, one for NULL func */
new = allocate_probes(nr_probes + 2);
if (new == NULL)
return ERR_PTR(-ENOMEM);
- if (old)
- memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
- new[nr_probes] = *tp_func;
+ if (old) {
+ if (pos < 0) {
+ pos = nr_probes;
+ memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+ } else {
+ /* Copy higher priority probes ahead of the new probe */
+ memcpy(new, old, pos * sizeof(struct tracepoint_func));
+ /* Copy the rest after it. */
+ memcpy(new + pos + 1, old + pos,
+ (nr_probes - pos) * sizeof(struct tracepoint_func));
+ }
+ } else
+ pos = 0;
+ new[pos] = *tp_func;
new[nr_probes + 1].func = NULL;
*funcs = new;
debug_print_probes(*funcs);
@@ -174,7 +191,7 @@ static void *func_remove(struct tracepoint_func **funcs,
* Add the probe function to a tracepoint.
*/
static int tracepoint_add_func(struct tracepoint *tp,
- struct tracepoint_func *func)
+ struct tracepoint_func *func, int prio)
{
struct tracepoint_func *old, *tp_funcs;
@@ -183,7 +200,7 @@ static int tracepoint_add_func(struct tracepoint *tp,
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
- old = func_add(&tp_funcs, func);
+ old = func_add(&tp_funcs, func, prio);
if (IS_ERR(old)) {
WARN_ON_ONCE(1);
return PTR_ERR(old);
@@ -240,6 +257,7 @@ static int tracepoint_remove_func(struct tracepoint *tp,
* @tp: tracepoint
* @probe: probe handler
* @data: tracepoint data
+ * @prio: priority of this function over other registered functions
*
* Returns 0 if ok, error value on error.
* Note: if @tp is within a module, the caller is responsible for
@@ -247,7 +265,8 @@ static int tracepoint_remove_func(struct tracepoint *tp,
* performed either with a tracepoint module going notifier, or from
* within module exit functions.
*/
-int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
+int tracepoint_probe_register_prio(struct tracepoint *tp, void *probe,
+ void *data, int prio)
{
struct tracepoint_func tp_func;
int ret;
@@ -255,10 +274,30 @@ int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
mutex_lock(&tracepoints_mutex);
tp_func.func = probe;
tp_func.data = data;
- ret = tracepoint_add_func(tp, &tp_func);
+ tp_func.prio = prio;
+ ret = tracepoint_add_func(tp, &tp_func, prio);
mutex_unlock(&tracepoints_mutex);
return ret;
}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_prio);
+
+/**
+ * tracepoint_probe_register - Connect a probe to a tracepoint
+ * @tp: tracepoint
+ * @probe: probe handler
+ * @data: tracepoint data
+ * @prio: priority of this function over other registered functions
+ *
+ * Returns 0 if ok, error value on error.
+ * Note: if @tp is within a module, the caller is responsible for
+ * unregistering the probe before the module is gone. This can be
+ * performed either with a tracepoint module going notifier, or from
+ * within module exit functions.
+ */
+int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data)
+{
+ return tracepoint_probe_register_prio(tp, probe, data, TRACEPOINT_DEFAULT_PRIO);
+}
EXPORT_SYMBOL_GPL(tracepoint_probe_register);
/**
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 64ed1c37bd1f..18f34cf75f74 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -57,8 +57,10 @@ int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#else
#define sysctl_softlockup_all_cpu_backtrace 0
+#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -110,8 +112,9 @@ static unsigned long soft_lockup_nmi_warn;
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic =
+unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -173,6 +176,13 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+static int __init hardlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_hardlockup_all_cpu_backtrace =
+ !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
#endif
/*
@@ -263,15 +273,15 @@ void touch_softlockup_watchdog_sync(void)
#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
-static int is_hardlockup(void)
+static bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return 1;
+ return true;
__this_cpu_write(hrtimer_interrupts_saved, hrint);
- return 0;
+ return false;
}
#endif
@@ -279,7 +289,7 @@ static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
- if (watchdog_enabled & SOFT_WATCHDOG_ENABLED) {
+ if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){
/* Warn about unreasonable delays. */
if (time_after(now, touch_ts + get_softlockup_thresh()))
return now - touch_ts;
@@ -318,17 +328,30 @@ static void watchdog_overflow_callback(struct perf_event *event,
*/
if (is_hardlockup()) {
int this_cpu = smp_processor_id();
+ struct pt_regs *regs = get_irq_regs();
/* only print hardlockups once */
if (__this_cpu_read(hard_watchdog_warn) == true)
return;
- if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d",
- this_cpu);
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
- this_cpu);
+ dump_stack();
+
+ /*
+ * Perform all-CPU dump only once to avoid multiple hardlockups
+ * generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ panic("Hard LOCKUP");
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -347,6 +370,9 @@ static void watchdog_interrupt_count(void)
static int watchdog_nmi_enable(unsigned int cpu);
static void watchdog_nmi_disable(unsigned int cpu);
+static int watchdog_enable_all_cpus(void);
+static void watchdog_disable_all_cpus(void);
+
/* watchdog kicker functions */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
@@ -651,37 +677,41 @@ static struct smp_hotplug_thread watchdog_threads = {
/*
* park all watchdog threads that are specified in 'watchdog_cpumask'
+ *
+ * This function returns an error if kthread_park() of a watchdog thread
+ * fails. In this situation, the watchdog threads of some CPUs can already
+ * be parked and the watchdog threads of other CPUs can still be runnable.
+ * Callers are expected to handle this special condition as appropriate in
+ * their context.
+ *
+ * This function may only be called in a context that is protected against
+ * races with CPU hotplug - for example, via get_online_cpus().
*/
static int watchdog_park_threads(void)
{
int cpu, ret = 0;
- get_online_cpus();
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
- if (ret) {
- for_each_watchdog_cpu(cpu)
- kthread_unpark(per_cpu(softlockup_watchdog, cpu));
- }
- put_online_cpus();
return ret;
}
/*
* unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ *
+ * This function may only be called in a context that is protected against
+ * races with CPU hotplug - for example, via get_online_cpus().
*/
static void watchdog_unpark_threads(void)
{
int cpu;
- get_online_cpus();
for_each_watchdog_cpu(cpu)
kthread_unpark(per_cpu(softlockup_watchdog, cpu));
- put_online_cpus();
}
/*
@@ -691,6 +721,7 @@ int lockup_detector_suspend(void)
{
int ret = 0;
+ get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
/*
* Multiple suspend requests can be active in parallel (counted by
@@ -704,6 +735,11 @@ int lockup_detector_suspend(void)
if (ret == 0)
watchdog_suspended++;
+ else {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to suspend lockup detectors, disabled\n");
+ watchdog_enabled = 0;
+ }
mutex_unlock(&watchdog_proc_mutex);
@@ -726,12 +762,20 @@ void lockup_detector_resume(void)
watchdog_unpark_threads();
mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
}
-static void update_watchdog_all_cpus(void)
+static int update_watchdog_all_cpus(void)
{
- watchdog_park_threads();
+ int ret;
+
+ ret = watchdog_park_threads();
+ if (ret)
+ return ret;
+
watchdog_unpark_threads();
+
+ return 0;
}
static int watchdog_enable_all_cpus(void)
@@ -750,15 +794,20 @@ static int watchdog_enable_all_cpus(void)
* Enable/disable the lockup detectors or
* change the sample period 'on the fly'.
*/
- update_watchdog_all_cpus();
+ err = update_watchdog_all_cpus();
+
+ if (err) {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to update lockup detectors, disabled\n");
+ }
}
+ if (err)
+ watchdog_enabled = 0;
+
return err;
}
-/* prepare/enable/disable routines */
-/* sysctl functions */
-#ifdef CONFIG_SYSCTL
static void watchdog_disable_all_cpus(void)
{
if (watchdog_running) {
@@ -767,6 +816,8 @@ static void watchdog_disable_all_cpus(void)
}
}
+#ifdef CONFIG_SYSCTL
+
/*
* Update the run state of the lockup detectors.
*/
@@ -808,6 +859,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
int err, old, new;
int *watchdog_param = (int *)table->data;
+ get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
if (watchdog_suspended) {
@@ -849,15 +901,17 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
} while (cmpxchg(&watchdog_enabled, old, new) != old);
/*
- * Update the run state of the lockup detectors.
- * Restore 'watchdog_enabled' on failure.
+ * Update the run state of the lockup detectors. There is _no_
+ * need to check the value returned by proc_watchdog_update()
+ * and to restore the previous value of 'watchdog_enabled' as
+ * both lockup detectors are disabled if proc_watchdog_update()
+ * returns an error.
*/
err = proc_watchdog_update();
- if (err)
- watchdog_enabled = old;
}
out:
mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
return err;
}
@@ -899,6 +953,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
{
int err, old;
+ get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
if (watchdog_suspended) {
@@ -914,15 +969,17 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
goto out;
/*
- * Update the sample period.
- * Restore 'watchdog_thresh' on failure.
+ * Update the sample period. Restore on failure.
*/
set_sample_period();
err = proc_watchdog_update();
- if (err)
+ if (err) {
watchdog_thresh = old;
+ set_sample_period();
+ }
out:
mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
return err;
}
@@ -937,6 +994,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
{
int err;
+ get_online_cpus();
mutex_lock(&watchdog_proc_mutex);
if (watchdog_suspended) {
@@ -964,6 +1022,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
}
out:
mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
return err;
}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bcb14cafe007..c579dbab2e36 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3199,6 +3199,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
u32 hash = wqattrs_hash(attrs);
struct worker_pool *pool;
int node;
+ int target_node = NUMA_NO_NODE;
lockdep_assert_held(&wq_pool_mutex);
@@ -3210,13 +3211,25 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
}
}
+ /* if cpumask is contained inside a NUMA node, we belong to that node */
+ if (wq_numa_enabled) {
+ for_each_node(node) {
+ if (cpumask_subset(attrs->cpumask,
+ wq_numa_possible_cpumask[node])) {
+ target_node = node;
+ break;
+ }
+ }
+ }
+
/* nope, create a new one */
- pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, target_node);
if (!pool || init_worker_pool(pool) < 0)
goto fail;
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
+ pool->node = target_node;
/*
* no_numa isn't a worker_pool attribute, always clear it. See
@@ -3224,17 +3237,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
*/
pool->attrs->no_numa = false;
- /* if cpumask is contained inside a NUMA node, we belong to that node */
- if (wq_numa_enabled) {
- for_each_node(node) {
- if (cpumask_subset(pool->attrs->cpumask,
- wq_numa_possible_cpumask[node])) {
- pool->node = node;
- break;
- }
- }
- }
-
if (worker_pool_assign_id(pool) < 0)
goto fail;