diff options
Diffstat (limited to 'kernel')
68 files changed, 3538 insertions, 2369 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 7c9b0a585502..b8d4cd8ac0b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o -obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o @@ -95,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/audit.c b/kernel/audit.c index defc2e6f1e3b..5feed232be9d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -855,18 +855,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; } case AUDIT_SIGNAL_INFO: - err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); - if (err) - return err; + len = 0; + if (audit_sig_sid) { + err = security_secid_to_secctx(audit_sig_sid, &ctx, &len); + if (err) + return err; + } sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); if (!sig_data) { - security_release_secctx(ctx, len); + if (audit_sig_sid) + security_release_secctx(ctx, len); return -ENOMEM; } sig_data->uid = audit_sig_uid; sig_data->pid = audit_sig_pid; - memcpy(sig_data->ctx, ctx, len); - security_release_secctx(ctx, len); + if (audit_sig_sid) { + memcpy(sig_data->ctx, ctx, len); + security_release_secctx(ctx, len); + } audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 0, 0, sig_data, sizeof(*sig_data) + len); kfree(sig_data); diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 0e96dbc60ea9..cc7e87936cbc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -45,8 +45,8 @@ struct audit_watch { atomic_t count; /* reference count */ - char *path; /* insertion path */ dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ unsigned long ino; /* associated inode number */ struct audit_parent *parent; /* associated parent */ struct list_head wlist; /* entry in parent->watches list */ diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 68d3c6a0ecd6..267e484f0198 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -168,12 +168,12 @@ struct audit_context { int in_syscall; /* 1 if task is in a syscall */ enum audit_state state, current_state; unsigned int serial; /* serial number for record */ - struct timespec ctime; /* time of syscall entry */ int major; /* syscall number */ + struct timespec ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ - int return_valid; /* return code is valid */ long return_code;/* syscall return code */ u64 prio; + int return_valid; /* return code is valid */ int name_count; struct audit_names names[AUDIT_NAMES]; char * filterkey; /* key for rule that triggered record */ @@ -198,8 +198,8 @@ struct audit_context { char target_comm[TASK_COMM_LEN]; struct audit_tree_refs *trees, *first_trees; - int tree_count; struct list_head killed_trees; + int tree_count; int type; union { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c7ece8f027f2..ca83b73fba19 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -23,6 +23,7 @@ */ #include <linux/cgroup.h> +#include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -48,6 +49,8 @@ #include <linux/namei.h> #include <linux/smp_lock.h> #include <linux/pid_namespace.h> +#include <linux/idr.h> +#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <asm/atomic.h> @@ -60,6 +63,8 @@ static struct cgroup_subsys *subsys[] = { #include <linux/cgroup_subsys.h> }; +#define MAX_CGROUP_ROOT_NAMELEN 64 + /* * A cgroupfs_root represents the root of a cgroup hierarchy, * and may be associated with a superblock to form an active @@ -74,6 +79,9 @@ struct cgroupfs_root { */ unsigned long subsys_bits; + /* Unique id for this hierarchy. */ + int hierarchy_id; + /* The bitmask of subsystems currently attached to this hierarchy */ unsigned long actual_subsys_bits; @@ -94,6 +102,9 @@ struct cgroupfs_root { /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; + + /* The name for this hierarchy - may be empty */ + char name[MAX_CGROUP_ROOT_NAMELEN]; }; /* @@ -141,6 +152,10 @@ struct css_id { static LIST_HEAD(roots); static int root_count; +static DEFINE_IDA(hierarchy_ida); +static int next_hierarchy_id; +static DEFINE_SPINLOCK(hierarchy_id_lock); + /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) @@ -201,6 +216,7 @@ struct cg_cgroup_link { * cgroup, anchored on cgroup->css_sets */ struct list_head cgrp_link_list; + struct cgroup *cgrp; /* * List running through cg_cgroup_links pointing at a * single css_set object, anchored on css_set->cg_links @@ -227,8 +243,11 @@ static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); static DEFINE_RWLOCK(css_set_lock); static int css_set_count; -/* hash table for cgroup groups. This improves the performance to - * find an existing css_set */ +/* + * hash table for cgroup groups. This improves the performance to find + * an existing css_set. This hash doesn't (currently) take into + * account cgroups in empty hierarchies. + */ #define CSS_SET_HASH_BITS 7 #define CSS_SET_TABLE_SIZE (1 << CSS_SET_HASH_BITS) static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE]; @@ -248,48 +267,22 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } +static void free_css_set_rcu(struct rcu_head *obj) +{ + struct css_set *cg = container_of(obj, struct css_set, rcu_head); + kfree(cg); +} + /* We don't maintain the lists running through each css_set to its * task until after the first call to cgroup_iter_start(). This * reduces the fork()/exit() overhead for people who have cgroups * compiled into their kernel but not actually in use */ static int use_task_css_set_links __read_mostly; -/* When we create or destroy a css_set, the operation simply - * takes/releases a reference count on all the cgroups referenced - * by subsystems in this css_set. This can end up multiple-counting - * some cgroups, but that's OK - the ref-count is just a - * busy/not-busy indicator; ensuring that we only count each cgroup - * once would require taking a global lock to ensure that no - * subsystems moved between hierarchies while we were doing so. - * - * Possible TODO: decide at boot time based on the number of - * registered subsystems and the number of CPUs or NUMA nodes whether - * it's better for performance to ref-count every subsystem, or to - * take a global lock and only add one ref count to each hierarchy. - */ - -/* - * unlink a css_set from the list and free it - */ -static void unlink_css_set(struct css_set *cg) +static void __put_css_set(struct css_set *cg, int taskexit) { struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - - hlist_del(&cg->hlist); - css_set_count--; - - list_for_each_entry_safe(link, saved_link, &cg->cg_links, - cg_link_list) { - list_del(&link->cg_link_list); - list_del(&link->cgrp_link_list); - kfree(link); - } -} - -static void __put_css_set(struct css_set *cg, int taskexit) -{ - int i; /* * Ensure that the refcount doesn't hit zero while any readers * can see it. Similar to atomic_dec_and_lock(), but for an @@ -302,21 +295,28 @@ static void __put_css_set(struct css_set *cg, int taskexit) write_unlock(&css_set_lock); return; } - unlink_css_set(cg); - write_unlock(&css_set_lock); - rcu_read_lock(); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup); + /* This css_set is dead. unlink it and release cgroup refcounts */ + hlist_del(&cg->hlist); + css_set_count--; + + list_for_each_entry_safe(link, saved_link, &cg->cg_links, + cg_link_list) { + struct cgroup *cgrp = link->cgrp; + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); if (atomic_dec_and_test(&cgrp->count) && notify_on_release(cgrp)) { if (taskexit) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } + + kfree(link); } - rcu_read_unlock(); - kfree(cg); + + write_unlock(&css_set_lock); + call_rcu(&cg->rcu_head, free_css_set_rcu); } /* @@ -338,6 +338,78 @@ static inline void put_css_set_taskexit(struct css_set *cg) } /* + * compare_css_sets - helper function for find_existing_css_set(). + * @cg: candidate css_set being tested + * @old_cg: existing css_set for a task + * @new_cgrp: cgroup that's being entered by the task + * @template: desired set of css pointers in css_set (pre-calculated) + * + * Returns true if "cg" matches "old_cg" except for the hierarchy + * which "new_cgrp" belongs to, for which it should match "new_cgrp". + */ +static bool compare_css_sets(struct css_set *cg, + struct css_set *old_cg, + struct cgroup *new_cgrp, + struct cgroup_subsys_state *template[]) +{ + struct list_head *l1, *l2; + + if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { + /* Not all subsystems matched */ + return false; + } + + /* + * Compare cgroup pointers in order to distinguish between + * different cgroups in heirarchies with no subsystems. We + * could get by with just this check alone (and skip the + * memcmp above) but on most setups the memcmp check will + * avoid the need for this more expensive check on almost all + * candidates. + */ + + l1 = &cg->cg_links; + l2 = &old_cg->cg_links; + while (1) { + struct cg_cgroup_link *cgl1, *cgl2; + struct cgroup *cg1, *cg2; + + l1 = l1->next; + l2 = l2->next; + /* See if we reached the end - both lists are equal length. */ + if (l1 == &cg->cg_links) { + BUG_ON(l2 != &old_cg->cg_links); + break; + } else { + BUG_ON(l2 == &old_cg->cg_links); + } + /* Locate the cgroups associated with these links. */ + cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); + cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); + cg1 = cgl1->cgrp; + cg2 = cgl2->cgrp; + /* Hierarchies should be linked in the same order. */ + BUG_ON(cg1->root != cg2->root); + + /* + * If this hierarchy is the hierarchy of the cgroup + * that's changing, then we need to check that this + * css_set points to the new cgroup; if it's any other + * hierarchy, then this css_set should point to the + * same cgroup as the old css_set. + */ + if (cg1->root == new_cgrp->root) { + if (cg1 != new_cgrp) + return false; + } else { + if (cg1 != cg2) + return false; + } + } + return true; +} + +/* * find_existing_css_set() is a helper for * find_css_set(), and checks to see whether an existing * css_set is suitable. @@ -378,10 +450,11 @@ static struct css_set *find_existing_css_set( hhead = css_set_hash(template); hlist_for_each_entry(cg, node, hhead, hlist) { - if (!memcmp(template, cg->subsys, sizeof(cg->subsys))) { - /* All subsystems matched */ - return cg; - } + if (!compare_css_sets(cg, oldcg, cgrp, template)) + continue; + + /* This css_set matches what we need */ + return cg; } /* No existing cgroup group matched */ @@ -435,8 +508,14 @@ static void link_css_set(struct list_head *tmp_cg_links, link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, cgrp_link_list); link->cg = cg; + link->cgrp = cgrp; + atomic_inc(&cgrp->count); list_move(&link->cgrp_link_list, &cgrp->css_sets); - list_add(&link->cg_link_list, &cg->cg_links); + /* + * Always add links to the tail of the list so that the list + * is sorted by order of hierarchy creation + */ + list_add_tail(&link->cg_link_list, &cg->cg_links); } /* @@ -451,11 +530,11 @@ static struct css_set *find_css_set( { struct css_set *res; struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; - int i; struct list_head tmp_cg_links; struct hlist_head *hhead; + struct cg_cgroup_link *link; /* First see if we already have a cgroup group that matches * the desired set */ @@ -489,20 +568,12 @@ static struct css_set *find_css_set( write_lock(&css_set_lock); /* Add reference counts and links from the new css_set. */ - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup *cgrp = res->subsys[i]->cgroup; - struct cgroup_subsys *ss = subsys[i]; - atomic_inc(&cgrp->count); - /* - * We want to add a link once per cgroup, so we - * only do it for the first subsystem in each - * hierarchy - */ - if (ss->root->subsys_list.next == &ss->sibling) - link_css_set(&tmp_cg_links, res, cgrp); + list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == cgrp->root) + c = cgrp; + link_css_set(&tmp_cg_links, res, c); } - if (list_empty(&rootnode.subsys_list)) - link_css_set(&tmp_cg_links, res, dummytop); BUG_ON(!list_empty(&tmp_cg_links)); @@ -518,6 +589,41 @@ static struct css_set *find_css_set( } /* + * Return the cgroup for "task" from the given hierarchy. Must be + * called with cgroup_mutex held. + */ +static struct cgroup *task_cgroup_from_root(struct task_struct *task, + struct cgroupfs_root *root) +{ + struct css_set *css; + struct cgroup *res = NULL; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + read_lock(&css_set_lock); + /* + * No need to lock the task - since we hold cgroup_mutex the + * task can't change groups, so the only thing that can happen + * is that it exits and its css is set back to init_css_set. + */ + css = task->cgroups; + if (css == &init_css_set) { + res = &root->top_cgroup; + } else { + struct cg_cgroup_link *link; + list_for_each_entry(link, &css->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + if (c->root == root) { + res = c; + break; + } + } + } + read_unlock(&css_set_lock); + BUG_ON(!res); + return res; +} + +/* * There is one global cgroup mutex. We also require taking * task_lock() when dereferencing a task's cgroup subsys pointers. * See "The task_lock() exception", at the end of this comment. @@ -596,8 +702,8 @@ void cgroup_unlock(void) static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); static int cgroup_populate_dir(struct cgroup *cgrp); -static struct inode_operations cgroup_dir_inode_operations; -static struct file_operations proc_cgroupstats_operations; +static const struct inode_operations cgroup_dir_inode_operations; +static const struct file_operations proc_cgroupstats_operations; static struct backing_dev_info cgroup_backing_dev_info = { .name = "cgroup", @@ -677,6 +783,12 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) */ deactivate_super(cgrp->root->sb); + /* + * if we're getting rid of the cgroup, refcount should ensure + * that there are no pidlists left. + */ + BUG_ON(!list_empty(&cgrp->pidlists)); + call_rcu(&cgrp->rcu_head, free_cgroup_rcu); } iput(inode); @@ -841,6 +953,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",noprefix"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); + if (strlen(root->name)) + seq_printf(seq, ",name=%s", root->name); mutex_unlock(&cgroup_mutex); return 0; } @@ -849,6 +963,12 @@ struct cgroup_sb_opts { unsigned long subsys_bits; unsigned long flags; char *release_agent; + char *name; + /* User explicitly requested empty subsystem */ + bool none; + + struct cgroupfs_root *new_root; + }; /* Convert a hierarchy specifier into a bitmask of subsystems and @@ -863,9 +983,7 @@ static int parse_cgroupfs_options(char *data, mask = ~(1UL << cpuset_subsys_id); #endif - opts->subsys_bits = 0; - opts->flags = 0; - opts->release_agent = NULL; + memset(opts, 0, sizeof(*opts)); while ((token = strsep(&o, ",")) != NULL) { if (!*token) @@ -879,17 +997,42 @@ static int parse_cgroupfs_options(char *data, if (!ss->disabled) opts->subsys_bits |= 1ul << i; } + } else if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; } else if (!strcmp(token, "noprefix")) { set_bit(ROOT_NOPREFIX, &opts->flags); } else if (!strncmp(token, "release_agent=", 14)) { /* Specifying two release agents is forbidden */ if (opts->release_agent) return -EINVAL; - opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL); + opts->release_agent = + kstrndup(token + 14, PATH_MAX, GFP_KERNEL); if (!opts->release_agent) return -ENOMEM; - strncpy(opts->release_agent, token + 14, PATH_MAX - 1); - opts->release_agent[PATH_MAX - 1] = 0; + } else if (!strncmp(token, "name=", 5)) { + int i; + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; } else { struct cgroup_subsys *ss; int i; @@ -906,6 +1049,8 @@ static int parse_cgroupfs_options(char *data, } } + /* Consistency checks */ + /* * Option noprefix was introduced just for backward compatibility * with the old cpuset, so we allow noprefix only if mounting just @@ -915,8 +1060,16 @@ static int parse_cgroupfs_options(char *data, (opts->subsys_bits & mask)) return -EINVAL; - /* We can't have an empty hierarchy */ - if (!opts->subsys_bits) + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_bits && opts->none) + return -EINVAL; + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_bits && !opts->name) return -EINVAL; return 0; @@ -944,6 +1097,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* Don't allow name to change at remount */ + if (opts.name && strcmp(opts.name, root->name)) { + ret = -EINVAL; + goto out_unlock; + } + ret = rebind_subsystems(root, opts.subsys_bits); if (ret) goto out_unlock; @@ -955,13 +1114,14 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) strcpy(root->release_agent_path, opts.release_agent); out_unlock: kfree(opts.release_agent); + kfree(opts.name); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); unlock_kernel(); return ret; } -static struct super_operations cgroup_ops = { +static const struct super_operations cgroup_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = cgroup_show_options, @@ -974,9 +1134,10 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); - INIT_LIST_HEAD(&cgrp->pids_list); - init_rwsem(&cgrp->pids_mutex); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); } + static void init_cgroup_root(struct cgroupfs_root *root) { struct cgroup *cgrp = &root->top_cgroup; @@ -988,33 +1149,106 @@ static void init_cgroup_root(struct cgroupfs_root *root) init_cgroup_housekeeping(cgrp); } +static bool init_root_id(struct cgroupfs_root *root) +{ + int ret = 0; + + do { + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) + return false; + spin_lock(&hierarchy_id_lock); + /* Try to allocate the next unused ID */ + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, + &root->hierarchy_id); + if (ret == -ENOSPC) + /* Try again starting from 0 */ + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); + if (!ret) { + next_hierarchy_id = root->hierarchy_id + 1; + } else if (ret != -EAGAIN) { + /* Can only get here if the 31-bit IDR is full ... */ + BUG_ON(ret); + } + spin_unlock(&hierarchy_id_lock); + } while (ret); + return true; +} + static int cgroup_test_super(struct super_block *sb, void *data) { - struct cgroupfs_root *new = data; + struct cgroup_sb_opts *opts = data; struct cgroupfs_root *root = sb->s_fs_info; - /* First check subsystems */ - if (new->subsys_bits != root->subsys_bits) - return 0; + /* If we asked for a name then it must match */ + if (opts->name && strcmp(opts->name, root->name)) + return 0; - /* Next check flags */ - if (new->flags != root->flags) + /* + * If we asked for subsystems (or explicitly for no + * subsystems) then they must match + */ + if ((opts->subsys_bits || opts->none) + && (opts->subsys_bits != root->subsys_bits)) return 0; return 1; } +static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) +{ + struct cgroupfs_root *root; + + if (!opts->subsys_bits && !opts->none) + return NULL; + + root = kzalloc(sizeof(*root), GFP_KERNEL); + if (!root) + return ERR_PTR(-ENOMEM); + + if (!init_root_id(root)) { + kfree(root); + return ERR_PTR(-ENOMEM); + } + init_cgroup_root(root); + + root->subsys_bits = opts->subsys_bits; + root->flags = opts->flags; + if (opts->release_agent) + strcpy(root->release_agent_path, opts->release_agent); + if (opts->name) + strcpy(root->name, opts->name); + return root; +} + +static void cgroup_drop_root(struct cgroupfs_root *root) +{ + if (!root) + return; + + BUG_ON(!root->hierarchy_id); + spin_lock(&hierarchy_id_lock); + ida_remove(&hierarchy_ida, root->hierarchy_id); + spin_unlock(&hierarchy_id_lock); + kfree(root); +} + static int cgroup_set_super(struct super_block *sb, void *data) { int ret; - struct cgroupfs_root *root = data; + struct cgroup_sb_opts *opts = data; + + /* If we don't have a new root, we can't set up a new sb */ + if (!opts->new_root) + return -EINVAL; + + BUG_ON(!opts->subsys_bits && !opts->none); ret = set_anon_super(sb, NULL); if (ret) return ret; - sb->s_fs_info = root; - root->sb = sb; + sb->s_fs_info = opts->new_root; + opts->new_root->sb = sb; sb->s_blocksize = PAGE_CACHE_SIZE; sb->s_blocksize_bits = PAGE_CACHE_SHIFT; @@ -1051,48 +1285,43 @@ static int cgroup_get_sb(struct file_system_type *fs_type, void *data, struct vfsmount *mnt) { struct cgroup_sb_opts opts; + struct cgroupfs_root *root; int ret = 0; struct super_block *sb; - struct cgroupfs_root *root; - struct list_head tmp_cg_links; + struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); - if (ret) { - kfree(opts.release_agent); - return ret; - } - - root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) { - kfree(opts.release_agent); - return -ENOMEM; - } + if (ret) + goto out_err; - init_cgroup_root(root); - root->subsys_bits = opts.subsys_bits; - root->flags = opts.flags; - if (opts.release_agent) { - strcpy(root->release_agent_path, opts.release_agent); - kfree(opts.release_agent); + /* + * Allocate a new cgroup root. We may not need it if we're + * reusing an existing hierarchy. + */ + new_root = cgroup_root_from_opts(&opts); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto out_err; } + opts.new_root = new_root; - sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root); - + /* Locate an existing or new sb for this hierarchy */ + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); if (IS_ERR(sb)) { - kfree(root); - return PTR_ERR(sb); + ret = PTR_ERR(sb); + cgroup_drop_root(opts.new_root); + goto out_err; } - if (sb->s_fs_info != root) { - /* Reusing an existing superblock */ - BUG_ON(sb->s_root == NULL); - kfree(root); - root = NULL; - } else { - /* New superblock */ + root = sb->s_fs_info; + BUG_ON(!root); + if (root == opts.new_root) { + /* We used the new root structure, so this is a new hierarchy */ + struct list_head tmp_cg_links; struct cgroup *root_cgrp = &root->top_cgroup; struct inode *inode; + struct cgroupfs_root *existing_root; int i; BUG_ON(sb->s_root != NULL); @@ -1105,6 +1334,18 @@ static int cgroup_get_sb(struct file_system_type *fs_type, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); + if (strlen(root->name)) { + /* Check for name clashes with existing mounts */ + for_each_active_root(existing_root) { + if (!strcmp(existing_root->name, root->name)) { + ret = -EBUSY; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + } + } + /* * We're accessing css_set_count without locking * css_set_lock here, but that's OK - it can only be @@ -1123,7 +1364,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (ret == -EBUSY) { mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); - goto free_cg_links; + free_cg_links(&tmp_cg_links); + goto drop_new_super; } /* EBUSY should be the only error here */ @@ -1155,17 +1397,27 @@ static int cgroup_get_sb(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cgroup_populate_dir(root_cgrp); - mutex_unlock(&inode->i_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + } else { + /* + * We re-used an existing hierarchy - the new root (if + * any) is not needed + */ + cgroup_drop_root(opts.new_root); } simple_set_mnt(mnt, sb); + kfree(opts.release_agent); + kfree(opts.name); return 0; - free_cg_links: - free_cg_links(&tmp_cg_links); drop_new_super: deactivate_locked_super(sb); + out_err: + kfree(opts.release_agent); + kfree(opts.name); + return ret; } @@ -1211,7 +1463,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_mutex); kill_litter_super(sb); - kfree(root); + cgroup_drop_root(root); } static struct file_system_type cgroup_fs_type = { @@ -1276,27 +1528,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } -/* - * Return the first subsystem attached to a cgroup's hierarchy, and - * its subsystem id. - */ - -static void get_first_subsys(const struct cgroup *cgrp, - struct cgroup_subsys_state **css, int *subsys_id) -{ - const struct cgroupfs_root *root = cgrp->root; - const struct cgroup_subsys *test_ss; - BUG_ON(list_empty(&root->subsys_list)); - test_ss = list_entry(root->subsys_list.next, - struct cgroup_subsys, sibling); - if (css) { - *css = cgrp->subsys[test_ss->subsys_id]; - BUG_ON(!*css); - } - if (subsys_id) - *subsys_id = test_ss->subsys_id; -} - /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1313,18 +1544,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct css_set *cg; struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; - int subsys_id; - - get_first_subsys(cgrp, NULL, &subsys_id); /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup(tsk, subsys_id); + oldcgrp = task_cgroup_from_root(tsk, root); if (cgrp == oldcgrp) return 0; for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk); + retval = ss->can_attach(ss, cgrp, tsk, false); if (retval) return retval; } @@ -1362,7 +1590,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk); + ss->attach(ss, cgrp, oldcgrp, tsk, false); } set_bit(CGRP_RELEASABLE, &oldcgrp->flags); synchronize_rcu(); @@ -1423,15 +1651,6 @@ static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) return ret; } -/* The various types of files and directories in a cgroup file system */ -enum cgroup_filetype { - FILE_ROOT, - FILE_DIR, - FILE_TASKLIST, - FILE_NOTIFY_ON_RELEASE, - FILE_RELEASE_AGENT, -}; - /** * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. * @cgrp: the cgroup to be checked for liveness @@ -1644,7 +1863,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file) return single_release(inode, file); } -static struct file_operations cgroup_seqfile_operations = { +static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, @@ -1703,7 +1922,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, return simple_rename(old_dir, old_dentry, new_dir, new_dentry); } -static struct file_operations cgroup_file_operations = { +static const struct file_operations cgroup_file_operations = { .read = cgroup_file_read, .write = cgroup_file_write, .llseek = generic_file_llseek, @@ -1711,7 +1930,7 @@ static struct file_operations cgroup_file_operations = { .release = cgroup_file_release, }; -static struct inode_operations cgroup_dir_inode_operations = { +static const struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, .rmdir = cgroup_rmdir, @@ -1876,7 +2095,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * the start of a css_set */ static void cgroup_advance_iter(struct cgroup *cgrp, - struct cgroup_iter *it) + struct cgroup_iter *it) { struct list_head *l = it->cg_link; struct cg_cgroup_link *link; @@ -2129,7 +2348,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) } /* - * Stuff for reading the 'tasks' file. + * Stuff for reading the 'tasks'/'procs' files. * * Reading this file can return large amounts of data if a cgroup has * *lots* of attached tasks. So it may need several calls to read(), @@ -2139,27 +2358,196 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ /* - * Load into 'pidarray' up to 'npids' of the tasks using cgroup - * 'cgrp'. Return actual number of pids loaded. No need to - * task_lock(p) when reading out p->cgroup, since we're in an RCU - * read section, so the css_set can't go away, and is - * immutable after creation. + * The following two functions "fix" the issue where there are more pids + * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. + * TODO: replace with a kernel-wide solution to this problem + */ +#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) +static void *pidlist_allocate(int count) +{ + if (PIDLIST_TOO_LARGE(count)) + return vmalloc(count * sizeof(pid_t)); + else + return kmalloc(count * sizeof(pid_t), GFP_KERNEL); +} +static void pidlist_free(void *p) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + kfree(p); +} +static void *pidlist_resize(void *p, int newcount) +{ + void *newlist; + /* note: if new alloc fails, old p will still be valid either way */ + if (is_vmalloc_addr(p)) { + newlist = vmalloc(newcount * sizeof(pid_t)); + if (!newlist) + return NULL; + memcpy(newlist, p, newcount * sizeof(pid_t)); + vfree(p); + } else { + newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); + } + return newlist; +} + +/* + * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries + * If the new stripped list is sufficiently smaller and there's enough memory + * to allocate a new buffer, will let go of the unneeded memory. Returns the + * number of unique elements. + */ +/* is the size difference enough that we should re-allocate the array? */ +#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) +static int pidlist_uniq(pid_t **p, int length) +{ + int src, dest = 1; + pid_t *list = *p; + pid_t *newlist; + + /* + * we presume the 0th element is unique, so i starts at 1. trivial + * edge cases first; no work needs to be done for either + */ + if (length == 0 || length == 1) + return length; + /* src and dest walk down the list; dest counts unique elements */ + for (src = 1; src < length; src++) { + /* find next unique element */ + while (list[src] == list[src-1]) { + src++; + if (src == length) + goto after; + } + /* dest always points to where the next unique element goes */ + list[dest] = list[src]; + dest++; + } +after: + /* + * if the length difference is large enough, we want to allocate a + * smaller buffer to save memory. if this fails due to out of memory, + * we'll just stay with what we've got. + */ + if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { + newlist = pidlist_resize(list, dest); + if (newlist) + *p = newlist; + } + return dest; +} + +static int cmppid(const void *a, const void *b) +{ + return *(pid_t *)a - *(pid_t *)b; +} + +/* + * find the appropriate pidlist for our purpose (given procs vs tasks) + * returns with the lock on that pidlist already held, and takes care + * of the use count, or returns NULL with no locks held if we're out of + * memory. */ -static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) +static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, + enum cgroup_filetype type) { - int n = 0, pid; + struct cgroup_pidlist *l; + /* don't need task_nsproxy() if we're looking at ourself */ + struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + /* + * We can't drop the pidlist_mutex before taking the l->mutex in case + * the last ref-holder is trying to remove l from the list at the same + * time. Holding the pidlist_mutex precludes somebody taking whichever + * list we find out from under us - compare release_pid_array(). + */ + mutex_lock(&cgrp->pidlist_mutex); + list_for_each_entry(l, &cgrp->pidlists, links) { + if (l->key.type == type && l->key.ns == ns) { + /* found a matching list - drop the extra refcount */ + put_pid_ns(ns); + /* make sure l doesn't vanish out from under us */ + down_write(&l->mutex); + mutex_unlock(&cgrp->pidlist_mutex); + l->use_count++; + return l; + } + } + /* entry not found; create a new one */ + l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); + if (!l) { + mutex_unlock(&cgrp->pidlist_mutex); + put_pid_ns(ns); + return l; + } + init_rwsem(&l->mutex); + down_write(&l->mutex); + l->key.type = type; + l->key.ns = ns; + l->use_count = 0; /* don't increment here */ + l->list = NULL; + l->owner = cgrp; + list_add(&l->links, &cgrp->pidlists); + mutex_unlock(&cgrp->pidlist_mutex); + return l; +} + +/* + * Load a cgroup's pidarray with either procs' tgids or tasks' pids + */ +static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, + struct cgroup_pidlist **lp) +{ + pid_t *array; + int length; + int pid, n = 0; /* used for populating the array */ struct cgroup_iter it; struct task_struct *tsk; + struct cgroup_pidlist *l; + + /* + * If cgroup gets more users after we read count, we won't have + * enough space - tough. This race is indistinguishable to the + * caller from the case that the additional cgroup users didn't + * show up until sometime later on. + */ + length = cgroup_task_count(cgrp); + array = pidlist_allocate(length); + if (!array) + return -ENOMEM; + /* now, populate the array */ cgroup_iter_start(cgrp, &it); while ((tsk = cgroup_iter_next(cgrp, &it))) { - if (unlikely(n == npids)) + if (unlikely(n == length)) break; - pid = task_pid_vnr(tsk); - if (pid > 0) - pidarray[n++] = pid; + /* get tgid or pid for procs or tasks file respectively */ + if (type == CGROUP_FILE_PROCS) + pid = task_tgid_vnr(tsk); + else + pid = task_pid_vnr(tsk); + if (pid > 0) /* make sure to only use valid results */ + array[n++] = pid; } cgroup_iter_end(cgrp, &it); - return n; + length = n; + /* now sort & (if procs) strip out duplicates */ + sort(array, length, sizeof(pid_t), cmppid, NULL); + if (type == CGROUP_FILE_PROCS) + length = pidlist_uniq(&array, length); + l = cgroup_pidlist_find(cgrp, type); + if (!l) { + pidlist_free(array); + return -ENOMEM; + } + /* store array, freeing old if necessary - lock already held */ + pidlist_free(l->list); + l->list = array; + l->length = length; + l->use_count++; + up_write(&l->mutex); + *lp = l; + return 0; } /** @@ -2216,37 +2604,14 @@ err: return ret; } -/* - * Cache pids for all threads in the same pid namespace that are - * opening the same "tasks" file. - */ -struct cgroup_pids { - /* The node in cgrp->pids_list */ - struct list_head list; - /* The cgroup those pids belong to */ - struct cgroup *cgrp; - /* The namepsace those pids belong to */ - struct pid_namespace *ns; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the this tasks_pids array */ - int use_count; - /* Length of the current tasks_pids array */ - int length; -}; - -static int cmppid(const void *a, const void *b) -{ - return *(pid_t *)a - *(pid_t *)b; -} /* - * seq_file methods for the "tasks" file. The seq_file position is the + * seq_file methods for the tasks/procs files. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid - * in the cgroup->tasks_pids array. + * in the cgroup->l->list array. */ -static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) +static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) { /* * Initially we receive a position value that corresponds to @@ -2254,48 +2619,45 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; + struct cgroup_pidlist *l = s->private; int index = 0, pid = *pos; int *iter; - down_read(&cgrp->pids_mutex); + down_read(&l->mutex); if (pid) { - int end = cp->length; + int end = l->length; while (index < end) { int mid = (index + end) / 2; - if (cp->tasks_pids[mid] == pid) { + if (l->list[mid] == pid) { index = mid; break; - } else if (cp->tasks_pids[mid] <= pid) + } else if (l->list[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cp->length) + if (index >= l->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cp->tasks_pids + index; + iter = l->list + index; *pos = *iter; return iter; } -static void cgroup_tasks_stop(struct seq_file *s, void *v) +static void cgroup_pidlist_stop(struct seq_file *s, void *v) { - struct cgroup_pids *cp = s->private; - struct cgroup *cgrp = cp->cgrp; - up_read(&cgrp->pids_mutex); + struct cgroup_pidlist *l = s->private; + up_read(&l->mutex); } -static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) +static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup_pids *cp = s->private; - int *p = v; - int *end = cp->tasks_pids + cp->length; - + struct cgroup_pidlist *l = s->private; + pid_t *p = v; + pid_t *end = l->list + l->length; /* * Advance to the next pid in the array. If this goes off the * end, we're done @@ -2309,124 +2671,107 @@ static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) } } -static int cgroup_tasks_show(struct seq_file *s, void *v) +static int cgroup_pidlist_show(struct seq_file *s, void *v) { return seq_printf(s, "%d\n", *(int *)v); } -static struct seq_operations cgroup_tasks_seq_operations = { - .start = cgroup_tasks_start, - .stop = cgroup_tasks_stop, - .next = cgroup_tasks_next, - .show = cgroup_tasks_show, +/* + * seq_operations functions for iterating on pidlists through seq_file - + * independent of whether it's tasks or procs + */ +static const struct seq_operations cgroup_pidlist_seq_operations = { + .start = cgroup_pidlist_start, + .stop = cgroup_pidlist_stop, + .next = cgroup_pidlist_next, + .show = cgroup_pidlist_show, }; -static void release_cgroup_pid_array(struct cgroup_pids *cp) +static void cgroup_release_pid_array(struct cgroup_pidlist *l) { - struct cgroup *cgrp = cp->cgrp; - - down_write(&cgrp->pids_mutex); - BUG_ON(!cp->use_count); - if (!--cp->use_count) { - list_del(&cp->list); - put_pid_ns(cp->ns); - kfree(cp->tasks_pids); - kfree(cp); + /* + * the case where we're the last user of this particular pidlist will + * have us remove it from the cgroup's list, which entails taking the + * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> + * pidlist_mutex, we have to take pidlist_mutex first. + */ + mutex_lock(&l->owner->pidlist_mutex); + down_write(&l->mutex); + BUG_ON(!l->use_count); + if (!--l->use_count) { + /* we're the last user if refcount is 0; remove and free */ + list_del(&l->links); + mutex_unlock(&l->owner->pidlist_mutex); + pidlist_free(l->list); + put_pid_ns(l->key.ns); + up_write(&l->mutex); + kfree(l); + return; } - up_write(&cgrp->pids_mutex); + mutex_unlock(&l->owner->pidlist_mutex); + up_write(&l->mutex); } -static int cgroup_tasks_release(struct inode *inode, struct file *file) +static int cgroup_pidlist_release(struct inode *inode, struct file *file) { - struct seq_file *seq; - struct cgroup_pids *cp; - + struct cgroup_pidlist *l; if (!(file->f_mode & FMODE_READ)) return 0; - - seq = file->private_data; - cp = seq->private; - - release_cgroup_pid_array(cp); + /* + * the seq_file will only be initialized if the file was opened for + * reading; hence we check if it's not null only in that case. + */ + l = ((struct seq_file *)file->private_data)->private; + cgroup_release_pid_array(l); return seq_release(inode, file); } -static struct file_operations cgroup_tasks_operations = { +static const struct file_operations cgroup_pidlist_operations = { .read = seq_read, .llseek = seq_lseek, .write = cgroup_file_write, - .release = cgroup_tasks_release, + .release = cgroup_pidlist_release, }; /* - * Handle an open on 'tasks' file. Prepare an array containing the - * process id's of tasks currently attached to the cgroup being opened. + * The following functions handle opens on a file that displays a pidlist + * (tasks or procs). Prepare an array of the process/thread IDs of whoever's + * in the cgroup. */ - -static int cgroup_tasks_open(struct inode *unused, struct file *file) +/* helper function for the two below it */ +static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - struct pid_namespace *ns = current->nsproxy->pid_ns; - struct cgroup_pids *cp; - pid_t *pidarray; - int npids; + struct cgroup_pidlist *l; int retval; /* Nothing to do for write-only files */ if (!(file->f_mode & FMODE_READ)) return 0; - /* - * If cgroup gets more users after we read count, we won't have - * enough space - tough. This race is indistinguishable to the - * caller from the case that the additional cgroup users didn't - * show up until sometime later on. - */ - npids = cgroup_task_count(cgrp); - pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); - if (!pidarray) - return -ENOMEM; - npids = pid_array_load(pidarray, npids, cgrp); - sort(pidarray, npids, sizeof(pid_t), cmppid, NULL); - - /* - * Store the array in the cgroup, freeing the old - * array if necessary - */ - down_write(&cgrp->pids_mutex); - - list_for_each_entry(cp, &cgrp->pids_list, list) { - if (ns == cp->ns) - goto found; - } - - cp = kzalloc(sizeof(*cp), GFP_KERNEL); - if (!cp) { - up_write(&cgrp->pids_mutex); - kfree(pidarray); - return -ENOMEM; - } - cp->cgrp = cgrp; - cp->ns = ns; - get_pid_ns(ns); - list_add(&cp->list, &cgrp->pids_list); -found: - kfree(cp->tasks_pids); - cp->tasks_pids = pidarray; - cp->length = npids; - cp->use_count++; - up_write(&cgrp->pids_mutex); - - file->f_op = &cgroup_tasks_operations; + /* have the array populated */ + retval = pidlist_array_load(cgrp, type, &l); + if (retval) + return retval; + /* configure file information */ + file->f_op = &cgroup_pidlist_operations; - retval = seq_open(file, &cgroup_tasks_seq_operations); + retval = seq_open(file, &cgroup_pidlist_seq_operations); if (retval) { - release_cgroup_pid_array(cp); + cgroup_release_pid_array(l); return retval; } - ((struct seq_file *)file->private_data)->private = cp; + ((struct seq_file *)file->private_data)->private = l; return 0; } +static int cgroup_tasks_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); +} +static int cgroup_procs_open(struct inode *unused, struct file *file) +{ + return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); +} static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, struct cftype *cft) @@ -2449,21 +2794,27 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, /* * for the common functions, 'private' gives the type of file */ +/* for hysterical raisins, we can't put this on the older files */ +#define CGROUP_FILE_GENERIC_PREFIX "cgroup." static struct cftype files[] = { { .name = "tasks", .open = cgroup_tasks_open, .write_u64 = cgroup_tasks_write, - .release = cgroup_tasks_release, - .private = FILE_TASKLIST, + .release = cgroup_pidlist_release, .mode = S_IRUGO | S_IWUSR, }, - + { + .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .open = cgroup_procs_open, + /* .write_u64 = cgroup_procs_write, TODO */ + .release = cgroup_pidlist_release, + .mode = S_IRUGO, + }, { .name = "notify_on_release", .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, - .private = FILE_NOTIFY_ON_RELEASE, }, }; @@ -2472,7 +2823,6 @@ static struct cftype cft_release_agent = { .read_seq_string = cgroup_release_agent_show, .write_string = cgroup_release_agent_write, .max_write_len = PATH_MAX, - .private = FILE_RELEASE_AGENT, }; static int cgroup_populate_dir(struct cgroup *cgrp) @@ -2879,6 +3229,7 @@ int __init cgroup_init_early(void) init_task.cgroups = &init_css_set; init_css_set_link.cg = &init_css_set; + init_css_set_link.cgrp = dummytop; list_add(&init_css_set_link.cgrp_link_list, &rootnode.top_cgroup.css_sets); list_add(&init_css_set_link.cg_link_list, @@ -2933,7 +3284,7 @@ int __init cgroup_init(void) /* Add init_css_set to the hash table */ hhead = css_set_hash(init_css_set.subsys); hlist_add_head(&init_css_set.hlist, hhead); - + BUG_ON(!init_root_id(&rootnode)); err = register_filesystem(&cgroup_fs_type); if (err < 0) goto out; @@ -2986,15 +3337,16 @@ static int proc_cgroup_show(struct seq_file *m, void *v) for_each_active_root(root) { struct cgroup_subsys *ss; struct cgroup *cgrp; - int subsys_id; int count = 0; - seq_printf(m, "%lu:", root->subsys_bits); + seq_printf(m, "%d:", root->hierarchy_id); for_each_subsys(root, ss) seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); seq_putc(m, ':'); - get_first_subsys(&root->top_cgroup, NULL, &subsys_id); - cgrp = task_cgroup(tsk, subsys_id); + cgrp = task_cgroup_from_root(tsk, root); retval = cgroup_path(cgrp, buf, PAGE_SIZE); if (retval < 0) goto out_unlock; @@ -3017,7 +3369,7 @@ static int cgroup_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroup_show, pid); } -struct file_operations proc_cgroup_operations = { +const struct file_operations proc_cgroup_operations = { .open = cgroup_open, .read = seq_read, .llseek = seq_lseek, @@ -3033,8 +3385,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - seq_printf(m, "%s\t%lu\t%d\t%d\n", - ss->name, ss->root->subsys_bits, + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); } mutex_unlock(&cgroup_mutex); @@ -3046,7 +3398,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file) return single_open(file, proc_cgroupstats_show, NULL); } -static struct file_operations proc_cgroupstats_operations = { +static const struct file_operations proc_cgroupstats_operations = { .open = cgroupstats_open, .read = seq_read, .llseek = seq_lseek, @@ -3320,13 +3672,11 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) { int ret; struct cgroup *target; - int subsys_id; if (cgrp == dummytop) return 1; - get_first_subsys(cgrp, NULL, &subsys_id); - target = task_cgroup(task, subsys_id); + target = task_cgroup_from_root(task, cgrp->root); while (cgrp != target && cgrp!= cgrp->top_cgroup) cgrp = cgrp->parent; ret = (cgrp == target); @@ -3358,8 +3708,10 @@ static void check_for_release(struct cgroup *cgrp) void __css_put(struct cgroup_subsys_state *css) { struct cgroup *cgrp = css->cgroup; + int val; rcu_read_lock(); - if (atomic_dec_return(&css->refcnt) == 1) { + val = atomic_dec_return(&css->refcnt); + if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); @@ -3367,6 +3719,7 @@ void __css_put(struct cgroup_subsys_state *css) cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); + WARN_ON_ONCE(val < 1); } /* @@ -3693,3 +4046,154 @@ css_get_next(struct cgroup_subsys *ss, int id, return ret; } +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + return cgroup_task_count(cont); +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(¤t->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + struct css_set *cg; + + read_lock(&css_set_lock); + rcu_read_lock(); + cg = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + const char *name; + + if (c->dentry) + name = c->dentry->d_name.name; + else + name = "?"; + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name); + } + rcu_read_unlock(); + read_unlock(&css_set_lock); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct task_struct *task; + int count = 0; + seq_printf(seq, "css_set %p\n", cg); + list_for_each_entry(task, &cg->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) { + seq_puts(seq, " ...\n"); + break; + } else { + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + } + } + read_unlock(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype debug_files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .read_seq_string = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .read_seq_string = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, debug_files, + ARRAY_SIZE(debug_files)); +} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c deleted file mode 100644 index 0c92d797baa6..000000000000 --- a/kernel/cgroup_debug.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * kernel/cgroup_debug.c - Example cgroup subsystem that - * exposes debug info - * - * Copyright (C) Google Inc, 2007 - * - * Developed by Paul Menage (menage@google.com) - * - */ - -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/slab.h> -#include <linux/rcupdate.h> - -#include <asm/atomic.h> - -static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, - struct cgroup *cont) -{ - struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); - - if (!css) - return ERR_PTR(-ENOMEM); - - return css; -} - -static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) -{ - kfree(cont->subsys[debug_subsys_id]); -} - -static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) -{ - return atomic_read(&cont->count); -} - -static u64 taskcount_read(struct cgroup *cont, struct cftype *cft) -{ - u64 count; - - count = cgroup_task_count(cont); - return count; -} - -static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) -{ - return (u64)(long)current->cgroups; -} - -static u64 current_css_set_refcount_read(struct cgroup *cont, - struct cftype *cft) -{ - u64 count; - - rcu_read_lock(); - count = atomic_read(¤t->cgroups->refcount); - rcu_read_unlock(); - return count; -} - -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - -static struct cftype files[] = { - { - .name = "cgroup_refcount", - .read_u64 = cgroup_refcount_read, - }, - { - .name = "taskcount", - .read_u64 = taskcount_read, - }, - - { - .name = "current_css_set", - .read_u64 = current_css_set_read, - }, - - { - .name = "current_css_set_refcount", - .read_u64 = current_css_set_refcount_read, - }, - - { - .name = "releasable", - .read_u64 = releasable_read, - }, -}; - -static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -{ - return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); -} - -struct cgroup_subsys debug_subsys = { - .name = "debug", - .create = debug_create, - .destroy = debug_destroy, - .populate = debug_populate, - .subsys_id = debug_subsys_id, -}; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index fb249e2bcada..59e9ef6aab40 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -159,7 +159,7 @@ static bool is_task_frozen_enough(struct task_struct *task) */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task) + struct task_struct *task, bool threadgroup) { struct freezer *freezer; @@ -177,6 +177,19 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state == CGROUP_FROZEN) return -EBUSY; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (is_task_frozen_enough(c)) { + rcu_read_unlock(); + return -EBUSY; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 7e75a41bd508..b5cb469d2545 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1324,9 +1324,10 @@ static int fmeter_getrate(struct fmeter *fmp) static cpumask_var_t cpus_attach; /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct task_struct *tsk) +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk, bool threadgroup) { + int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1343,18 +1344,51 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - return security_task_setscheduler(tsk, 0, NULL); + ret = security_task_setscheduler(tsk, 0, NULL); + if (ret) + return ret; + if (threadgroup) { + struct task_struct *c; + + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + ret = security_task_setscheduler(c, 0, NULL); + if (ret) { + rcu_read_unlock(); + return ret; + } + } + rcu_read_unlock(); + } + return 0; +} + +static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, + struct cpuset *cs) +{ + int err; + /* + * can_attach beforehand should guarantee that this doesn't fail. + * TODO: have a better way to handle failure here + */ + err = set_cpus_allowed_ptr(tsk, cpus_attach); + WARN_ON_ONCE(err); + + task_lock(tsk); + cpuset_change_task_nodemask(tsk, to); + task_unlock(tsk); + cpuset_update_task_spread_flag(cs, tsk); + } -static void cpuset_attach(struct cgroup_subsys *ss, - struct cgroup *cont, struct cgroup *oldcont, - struct task_struct *tsk) +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *oldcont, struct task_struct *tsk, + bool threadgroup) { nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - int err; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); @@ -1363,15 +1397,19 @@ static void cpuset_attach(struct cgroup_subsys *ss, guarantee_online_cpus(cs, cpus_attach); guarantee_online_mems(cs, &to); } - err = set_cpus_allowed_ptr(tsk, cpus_attach); - if (err) - return; - task_lock(tsk); - cpuset_change_task_nodemask(tsk, &to); - task_unlock(tsk); - cpuset_update_task_spread_flag(cs, tsk); + /* do per-task migration stuff possibly for each in the threadgroup */ + cpuset_attach_task(tsk, &to, cs); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + cpuset_attach_task(c, &to, cs); + } + rcu_read_unlock(); + } + /* change mm; only needs to be done once even if threadgroup */ from = oldcs->mems_allowed; to = cs->mems_allowed; mm = get_task_mm(tsk); diff --git a/kernel/cred.c b/kernel/cred.c index d7f7a01082eb..dd76cfe5f5b0 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -782,6 +782,25 @@ EXPORT_SYMBOL(set_create_files_as); #ifdef CONFIG_DEBUG_CREDENTIALS +bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; + if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) + return true; +#ifdef CONFIG_SECURITY_SELINUX + if (selinux_is_enabled()) { + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32 *)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; + } +#endif + return false; +} +EXPORT_SYMBOL(creds_are_invalid); + /* * dump invalid credentials */ diff --git a/kernel/exit.c b/kernel/exit.c index bc2b1fdfc354..e61891f80123 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include <linux/tracehook.h> #include <linux/fs_struct.h> #include <linux/init_task.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <trace/events/sched.h> #include <asm/uaccess.h> @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -359,8 +359,10 @@ void __set_special_pids(struct pid *pid) { struct task_struct *curr = current->group_leader; - if (task_session(curr) != pid) + if (task_session(curr) != pid) { change_pid(curr, PIDTYPE_SID, pid); + proc_sid_connector(curr); + } if (task_pgrp(curr) != pid) change_pid(curr, PIDTYPE_PGID, pid); @@ -945,6 +947,8 @@ NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); + if (tsk->mm) + setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm); } acct_collect(code, group_dead); if (group_dead) @@ -972,8 +976,6 @@ NORET_TYPE void do_exit(long code) disassociate_ctty(1); module_put(task_thread_info(tsk)->exec_domain->module); - if (tsk->binfmt) - module_put(tsk->binfmt->module); proc_exit_connector(tsk); @@ -981,7 +983,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA @@ -1091,28 +1093,28 @@ struct wait_opts { int __user *wo_stat; struct rusage __user *wo_rusage; + wait_queue_t child_wait; int notask_error; }; -static struct pid *task_pid_type(struct task_struct *task, enum pid_type type) +static inline +struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { - struct pid *pid = NULL; - if (type == PIDTYPE_PID) - pid = task->pids[type].pid; - else if (type < PIDTYPE_MAX) - pid = task->group_leader->pids[type].pid; - return pid; + if (type != PIDTYPE_PID) + task = task->group_leader; + return task->pids[type].pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int eligible_pid(struct wait_opts *wo, struct task_struct *p) { - int err; - - if (wo->wo_type < PIDTYPE_MAX) { - if (task_pid_type(p, wo->wo_type) != wo->wo_pid) - return 0; - } + return wo->wo_type == PIDTYPE_MAX || + task_pid_type(p, wo->wo_type) == wo->wo_pid; +} +static int eligible_child(struct wait_opts *wo, struct task_struct *p) +{ + if (!eligible_pid(wo, p)) + return 0; /* Wait for all children (clone and not) if __WALL is set; * otherwise, wait for clone children *only* if __WCLONE is * set; otherwise, wait for non-clone children *only*. (Note: @@ -1122,10 +1124,6 @@ static int eligible_child(struct wait_opts *wo, struct task_struct *p) && !(wo->wo_flags & __WALL)) return 0; - err = security_task_wait(p); - if (err) - return err; - return 1; } @@ -1138,18 +1136,20 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, put_task_struct(p); infop = wo->wo_info; - if (!retval) - retval = put_user(SIGCHLD, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user((short)why, &infop->si_code); - if (!retval) - retval = put_user(pid, &infop->si_pid); - if (!retval) - retval = put_user(uid, &infop->si_uid); - if (!retval) - retval = put_user(status, &infop->si_status); + if (infop) { + if (!retval) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval) + retval = put_user(0, &infop->si_errno); + if (!retval) + retval = put_user((short)why, &infop->si_code); + if (!retval) + retval = put_user(pid, &infop->si_pid); + if (!retval) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = put_user(status, &infop->si_status); + } if (!retval) retval = pid; return retval; @@ -1206,6 +1206,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (likely(!traced) && likely(!task_detached(p))) { struct signal_struct *psig; struct signal_struct *sig; + unsigned long maxrss; /* * The resource counters for the group leader are in its @@ -1254,6 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) psig->coublock += task_io_get_oublock(p) + sig->oublock + sig->coublock; + maxrss = max(sig->maxrss, sig->cmaxrss); + if (psig->cmaxrss < maxrss) + psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); spin_unlock_irq(&p->real_parent->sighand->siglock); @@ -1475,13 +1479,14 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. */ -static int wait_consider_task(struct wait_opts *wo, struct task_struct *parent, - int ptrace, struct task_struct *p) +static int wait_consider_task(struct wait_opts *wo, int ptrace, + struct task_struct *p) { int ret = eligible_child(wo, p); if (!ret) return ret; + ret = security_task_wait(p); if (unlikely(ret < 0)) { /* * If we have not yet seen any eligible child, @@ -1543,7 +1548,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) * Do not consider detached threads. */ if (!task_detached(p)) { - int ret = wait_consider_task(wo, tsk, 0, p); + int ret = wait_consider_task(wo, 0, p); if (ret) return ret; } @@ -1557,7 +1562,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) struct task_struct *p; list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { - int ret = wait_consider_task(wo, tsk, 1, p); + int ret = wait_consider_task(wo, 1, p); if (ret) return ret; } @@ -1565,15 +1570,38 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) return 0; } +static int child_wait_callback(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct wait_opts *wo = container_of(wait, struct wait_opts, + child_wait); + struct task_struct *p = key; + + if (!eligible_pid(wo, p)) + return 0; + + if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent) + return 0; + + return default_wake_function(wait, mode, sync, key); +} + +void __wake_up_parent(struct task_struct *p, struct task_struct *parent) +{ + __wake_up_sync_key(&parent->signal->wait_chldexit, + TASK_INTERRUPTIBLE, 1, p); +} + static long do_wait(struct wait_opts *wo) { - DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; int retval; trace_sched_process_wait(wo->wo_pid); - add_wait_queue(¤t->signal->wait_chldexit,&wait); + init_waitqueue_func_entry(&wo->child_wait, child_wait_callback); + wo->child_wait.private = current; + add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* * If there is nothing that can match our critiera just get out. @@ -1614,32 +1642,7 @@ notask: } end: __set_current_state(TASK_RUNNING); - remove_wait_queue(¤t->signal->wait_chldexit,&wait); - if (wo->wo_info) { - struct siginfo __user *infop = wo->wo_info; - - if (retval > 0) - retval = 0; - else { - /* - * For a WNOHANG return, clear out all the fields - * we would set so the user can easily tell the - * difference. - */ - if (!retval) - retval = put_user(0, &infop->si_signo); - if (!retval) - retval = put_user(0, &infop->si_errno); - if (!retval) - retval = put_user(0, &infop->si_code); - if (!retval) - retval = put_user(0, &infop->si_pid); - if (!retval) - retval = put_user(0, &infop->si_uid); - if (!retval) - retval = put_user(0, &infop->si_status); - } - } + remove_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); return retval; } @@ -1684,6 +1687,29 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, wo.wo_stat = NULL; wo.wo_rusage = ru; ret = do_wait(&wo); + + if (ret > 0) { + ret = 0; + } else if (infop) { + /* + * For a WNOHANG return, clear out all the fields + * we would set so the user can easily tell the + * difference. + */ + if (!ret) + ret = put_user(0, &infop->si_signo); + if (!ret) + ret = put_user(0, &infop->si_errno); + if (!ret) + ret = put_user(0, &infop->si_code); + if (!ret) + ret = put_user(0, &infop->si_pid); + if (!ret) + ret = put_user(0, &infop->si_uid); + if (!ret) + ret = put_user(0, &infop->si_status); + } + put_pid(pid); /* avoid REGPARM breakage on x86: */ diff --git a/kernel/fork.c b/kernel/fork.c index 341965b0ab1c..4c20fff8c13a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -49,6 +49,7 @@ #include <linux/ftrace.h> #include <linux/profile.h> #include <linux/rmap.h> +#include <linux/ksm.h> #include <linux/acct.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> @@ -61,7 +62,8 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> +#include <linux/posix-timers.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -136,9 +138,17 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +static void account_kernel_stack(struct thread_info *ti, int account) +{ + struct zone *zone = page_zone(virt_to_page(ti)); + + mod_zone_page_state(zone, NR_KERNEL_STACK, account); +} + void free_task(struct task_struct *tsk) { prop_local_destroy_single(&tsk->dirties); + account_kernel_stack(tsk->stack, -1); free_thread_info(tsk->stack); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); @@ -253,6 +263,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; + + account_kernel_stack(ti, 1); + return tsk; out: @@ -288,6 +301,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { struct file *file; @@ -418,22 +434,30 @@ __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> +static void mm_init_aio(struct mm_struct *mm) +{ +#ifdef CONFIG_AIO + spin_lock_init(&mm->ioctx_lock); + INIT_HLIST_HEAD(&mm->ioctx_list); +#endif +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); - mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; + mm->flags = (current->mm) ? + (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); set_mm_counter(mm, anon_rss, 0); spin_lock_init(&mm->page_table_lock); - spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; + mm_init_aio(mm); mm_init_owner(mm, p); if (likely(!mm_alloc_pgd(mm))) { @@ -485,6 +509,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); + ksm_exit(mm); exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { @@ -493,6 +518,8 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); } put_swap_token(mm); + if (mm->binfmt) + module_put(mm->binfmt->module); mmdrop(mm); } } @@ -624,9 +651,14 @@ struct mm_struct *dup_mm(struct task_struct *tsk) mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; + if (mm->binfmt && !try_module_get(mm->binfmt->module)) + goto free_pt; + return mm; free_pt: + /* don't put binfmt in mmput, we haven't got module yet */ + mm->binfmt = NULL; mmput(mm); fail_nomem: @@ -794,10 +826,10 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) thread_group_cputime_init(sig); /* Expiration times and increments. */ - sig->it_virt_expires = cputime_zero; - sig->it_virt_incr = cputime_zero; - sig->it_prof_expires = cputime_zero; - sig->it_prof_incr = cputime_zero; + sig->it[CPUCLOCK_PROF].expires = cputime_zero; + sig->it[CPUCLOCK_PROF].incr = cputime_zero; + sig->it[CPUCLOCK_VIRT].expires = cputime_zero; + sig->it[CPUCLOCK_VIRT].incr = cputime_zero; /* Cached expiration times. */ sig->cputime_expires.prof_exp = cputime_zero; @@ -855,6 +887,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; + sig->maxrss = sig->cmaxrss = 0; task_io_accounting_init(&sig->ioac); sig->sum_sched_runtime = 0; taskstats_tgid_init(sig); @@ -869,6 +902,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); + sig->oom_adj = current->signal->oom_adj; + return 0; } @@ -964,6 +999,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); + /* + * Siblings of global init remain as zombies on exit since they are + * not reaped by their parent (swapper). To solve this and to avoid + * multi-rooted process trees, prevent global and container-inits + * from creating siblings. + */ + if ((clone_flags & CLONE_PARENT) && + current->signal->flags & SIGNAL_UNKILLABLE) + return ERR_PTR(-EINVAL); + retval = security_task_create(clone_flags); if (retval) goto fork_out; @@ -1005,9 +1050,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (!try_module_get(task_thread_info(p)->exec_domain->module)) goto bad_fork_cleanup_count; - if (p->binfmt && !try_module_get(p->binfmt->module)) - goto bad_fork_cleanup_put_domain; - p->did_exec = 0; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); @@ -1081,10 +1123,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; + p->stack_start = stack_start; + /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1259,7 +1303,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1286,16 +1330,13 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); - if (p->binfmt) - module_put(p->binfmt->module); -bad_fork_cleanup_put_domain: module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/futex.c b/kernel/futex.c index c3bb2fce11ba..4949d336d88d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1656,6 +1656,12 @@ out: static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { + /* + * The task state is guaranteed to be set before another task can + * wake it. set_current_state() is implemented using set_mb() and + * queue_me() calls spin_unlock() upon completion, both serializing + * access to the hash list and forcing another memory barrier. + */ set_current_state(TASK_INTERRUPTIBLE); queue_me(q, hb); diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 654efd09f6a9..70a298d6da71 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on S390 || X86 || (PPC && EXPERIMENTAL) + depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE default n ---help--- This options activates profiling for the entire kernel. diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c03f221fee44..3e1c36e7998f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -48,6 +48,8 @@ #include <asm/uaccess.h> +#include <trace/events/timer.h> + /* * The timer bases: * @@ -442,6 +444,26 @@ static inline void debug_hrtimer_activate(struct hrtimer *timer) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -487,13 +509,14 @@ static inline int hrtimer_hres_active(void) * next event * Called with interrupts disabled and base->lock held */ -static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { int i; struct hrtimer_clock_base *base = cpu_base->clock_base; - ktime_t expires; + ktime_t expires, expires_next; - cpu_base->expires_next.tv64 = KTIME_MAX; + expires_next.tv64 = KTIME_MAX; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { struct hrtimer *timer; @@ -509,10 +532,15 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base) */ if (expires.tv64 < 0) expires.tv64 = 0; - if (expires.tv64 < cpu_base->expires_next.tv64) - cpu_base->expires_next = expires; + if (expires.tv64 < expires_next.tv64) + expires_next = expires; } + if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + return; + + cpu_base->expires_next.tv64 = expires_next.tv64; + if (cpu_base->expires_next.tv64 != KTIME_MAX) tick_program_event(cpu_base->expires_next, 1); } @@ -595,7 +623,7 @@ static void retrigger_next_event(void *arg) base->clock_base[CLOCK_REALTIME].offset = timespec_to_ktime(realtime_offset); - hrtimer_force_reprogram(base); + hrtimer_force_reprogram(base, 0); spin_unlock(&base->lock); } @@ -698,8 +726,6 @@ static int hrtimer_switch_to_hres(void) /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); local_irq_restore(flags); - printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", - smp_processor_id()); return 1; } @@ -708,7 +734,8 @@ static int hrtimer_switch_to_hres(void) static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } -static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { } +static inline void +hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, struct hrtimer_clock_base *base, int wakeup) @@ -798,7 +825,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer *entry; int leftmost = 1; - debug_hrtimer_activate(timer); + debug_activate(timer); /* * Find the right place in the rbtree: @@ -851,19 +878,29 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - if (timer->state & HRTIMER_STATE_ENQUEUED) { - /* - * Remove the timer from the rbtree and replace the - * first entry pointer if necessary. - */ - if (base->first == &timer->node) { - base->first = rb_next(&timer->node); - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) - hrtimer_force_reprogram(base->cpu_base); + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; + + /* + * Remove the timer from the rbtree and replace the first + * entry pointer if necessary. + */ + if (base->first == &timer->node) { + base->first = rb_next(&timer->node); +#ifdef CONFIG_HIGH_RES_TIMERS + /* Reprogram the clock event device. if enabled */ + if (reprogram && hrtimer_hres_active()) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (base->cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(base->cpu_base, 1); } - rb_erase(&timer->node, &base->active); +#endif } + rb_erase(&timer->node, &base->active); +out: timer->state = newstate; } @@ -884,7 +921,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, @@ -1117,7 +1154,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { - debug_hrtimer_init(timer); + debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); @@ -1141,7 +1178,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) } EXPORT_SYMBOL_GPL(hrtimer_get_res); -static void __run_hrtimer(struct hrtimer *timer) +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) { struct hrtimer_clock_base *base = timer->base; struct hrtimer_cpu_base *cpu_base = base->cpu_base; @@ -1150,7 +1187,7 @@ static void __run_hrtimer(struct hrtimer *timer) WARN_ON(!irqs_disabled()); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1161,7 +1198,9 @@ static void __run_hrtimer(struct hrtimer *timer) * the timer base. */ spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); restart = fn(timer); + trace_hrtimer_expire_exit(timer); spin_lock(&cpu_base->lock); /* @@ -1272,7 +1311,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) break; } - __run_hrtimer(timer); + __run_hrtimer(timer, &basenow); } base++; } @@ -1394,7 +1433,7 @@ void hrtimer_run_queues(void) hrtimer_get_expires_tv64(timer)) break; - __run_hrtimer(timer); + __run_hrtimer(timer, &base->softirq_time); } spin_unlock(&cpu_base->lock); } @@ -1571,7 +1610,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, while ((node = rb_first(&old_base->active))) { timer = rb_entry(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); - debug_hrtimer_deactivate(timer); + debug_deactivate(timer); /* * Mark it as STATE_MIGRATE not INACTIVE otherwise the diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 022a4927b785..d4e841747400 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/kernel/itimer.c b/kernel/itimer.c index 58762f7077ec..b03451ede528 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/posix-timers.h> #include <linux/hrtimer.h> +#include <trace/events/timer.h> #include <asm/uaccess.h> @@ -41,10 +42,43 @@ static struct timeval itimer_get_remtime(struct hrtimer *timer) return ktime_to_timeval(rem); } +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + int do_getitimer(int which, struct itimerval *value) { struct task_struct *tsk = current; - cputime_t cinterval, cval; switch (which) { case ITIMER_REAL: @@ -55,44 +89,10 @@ int do_getitimer(int which, struct itimerval *value) spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime cputime; - cputime_t utime; - - thread_group_cputimer(tsk, &cputime); - utime = cputime.utime; - if (cputime_le(cval, utime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, utime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); break; case ITIMER_PROF: - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero)) { - struct task_cputime times; - cputime_t ptime; - - thread_group_cputimer(tsk, ×); - ptime = cputime_add(times.utime, times.stime); - if (cputime_le(cval, ptime)) { /* about to fire */ - cval = jiffies_to_cputime(1); - } else { - cval = cputime_sub(cval, ptime); - } - } - spin_unlock_irq(&tsk->sighand->siglock); - cputime_to_timeval(cval, &value->it_value); - cputime_to_timeval(cinterval, &value->it_interval); + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); break; default: return(-EINVAL); @@ -123,11 +123,62 @@ enum hrtimer_restart it_real_fn(struct hrtimer *timer) struct signal_struct *sig = container_of(timer, struct signal_struct, real_timer); + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); return HRTIMER_NORESTART; } +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); + it->error = cputime_sub_ns(nval, ns_nval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, cputime_one_jiffy); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? + ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + /* * Returns true if the timeval is in canonical form */ @@ -139,7 +190,6 @@ int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) struct task_struct *tsk = current; struct hrtimer *timer; ktime_t expires; - cputime_t cval, cinterval, nval, ninterval; /* * Validate the timevals in value. @@ -171,51 +221,14 @@ again: } else tsk->signal->it_real_incr.tv64 = 0; + trace_itimer_state(ITIMER_REAL, value, 0); spin_unlock_irq(&tsk->sighand->siglock); break; case ITIMER_VIRTUAL: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_virt_expires; - cinterval = tsk->signal->it_virt_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_VIRT, - &nval, &cval); - } - tsk->signal->it_virt_expires = nval; - tsk->signal->it_virt_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); break; case ITIMER_PROF: - nval = timeval_to_cputime(&value->it_value); - ninterval = timeval_to_cputime(&value->it_interval); - spin_lock_irq(&tsk->sighand->siglock); - cval = tsk->signal->it_prof_expires; - cinterval = tsk->signal->it_prof_incr; - if (!cputime_eq(cval, cputime_zero) || - !cputime_eq(nval, cputime_zero)) { - if (cputime_gt(nval, cputime_zero)) - nval = cputime_add(nval, - jiffies_to_cputime(1)); - set_process_cpu_timer(tsk, CPUCLOCK_PROF, - &nval, &cval); - } - tsk->signal->it_prof_expires = nval; - tsk->signal->it_prof_incr = ninterval; - spin_unlock_irq(&tsk->sighand->siglock); - if (ovalue) { - cputime_to_timeval(cval, &ovalue->it_value); - cputime_to_timeval(cinterval, &ovalue->it_interval); - } + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); break; default: return -EINVAL; diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 3a29dbe7898e..8b6b8b697c68 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -59,7 +59,8 @@ static inline int is_kernel_inittext(unsigned long addr) static inline int is_kernel_text(unsigned long addr) { - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) return 1; return in_gate_area_no_task(addr); } diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ef177d653b2c..5240d75f4c60 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1321,7 +1321,7 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) return 0; } -static struct seq_operations kprobes_seq_ops = { +static const struct seq_operations kprobes_seq_ops = { .start = kprobe_seq_start, .next = kprobe_seq_next, .stop = kprobe_seq_stop, @@ -1333,7 +1333,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp) return seq_open(filp, &kprobes_seq_ops); } -static struct file_operations debugfs_kprobes_operations = { +static const struct file_operations debugfs_kprobes_operations = { .open = kprobes_open, .read = seq_read, .llseek = seq_lseek, @@ -1515,7 +1515,7 @@ static ssize_t write_enabled_file_bool(struct file *file, return count; } -static struct file_operations fops_kp = { +static const struct file_operations fops_kp = { .read = read_enabled_file_bool, .write = write_enabled_file_bool, }; diff --git a/kernel/lockdep.c b/kernel/lockdep.c index f74d2d7aa605..3815ac1d58b2 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -578,6 +578,9 @@ static int static_obj(void *obj) if ((addr >= start) && (addr < end)) return 1; + if (arch_is_kernel_data(addr)) + return 1; + #ifdef CONFIG_SMP /* * percpu var? diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index d4b3dbc79fdb..d4aba4f3584c 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -594,7 +594,7 @@ static int ls_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations lockstat_ops = { +static const struct seq_operations lockstat_ops = { .start = ls_start, .next = ls_next, .stop = ls_stop, diff --git a/kernel/module.c b/kernel/module.c index b6ee424245dd..8b7d8805819d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -47,6 +47,7 @@ #include <linux/rculist.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> +#include <asm/mmu_context.h> #include <linux/license.h> #include <asm/sections.h> #include <linux/tracepoint.h> @@ -1535,6 +1536,10 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_free(mod, mod->module_core); + +#ifdef CONFIG_MPU + update_protections(current->mm); +#endif } void *__symbol_get(const char *symbol) @@ -1792,6 +1797,17 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs, } } +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + #ifdef CONFIG_KALLSYMS /* lookup symbol in given range of kernel_symbols */ @@ -1857,13 +1873,93 @@ static char elf_type(const Elf_Sym *sym, return '?'; } +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return true; +} + +static unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + unsigned long symoffs; + Elf_Shdr *symsect = sechdrs + symindex; + Elf_Shdr *strsect = sechdrs + strindex; + const Elf_Sym *src; + const char *strtab; + unsigned int i, nsrc, ndst; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + symsect->sh_name); + + src = (void *)hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + strtab = (void *)hdr + strsect->sh_offset; + for (ndst = i = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, sechdrs, hdr->e_shnum)) { + unsigned int j = src->st_name; + + while(!__test_and_set_bit(j, strmap) && strtab[j]) + ++j; + ++ndst; + } + + /* Append room for core symbols at end of core part. */ + symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strindex) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ + *pstroffs = mod->core_size; + __set_bit(0, strmap); + mod->core_size += bitmap_weight(strmap, strsect->sh_size); + + return symoffs; +} + static void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + unsigned long *strmap) { - unsigned int i; + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; mod->symtab = (void *)sechdrs[symindex].sh_addr; mod->num_symtab = sechdrs[symindex].sh_size / sizeof(Elf_Sym); @@ -1873,13 +1969,46 @@ static void add_kallsyms(struct module *mod, for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], sechdrs, secstrings, mod); + + mod->core_symtab = dst = mod->module_core + symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { + if (!is_core_symbol(src, sechdrs, shnum)) + continue; + dst[ndst] = *src; + dst[ndst].st_name = bitmap_weight(strmap, dst[ndst].st_name); + ++ndst; + } + mod->core_num_syms = ndst; + + mod->core_strtab = s = mod->module_core + stroffs; + for (*s = 0, i = 1; i < sechdrs[strindex].sh_size; ++i) + if (test_bit(i, strmap)) + *++s = mod->strtab[i]; } #else +static inline unsigned long layout_symtab(struct module *mod, + Elf_Shdr *sechdrs, + unsigned int symindex, + unsigned int strindex, + const Elf_Ehdr *hdr, + const char *secstrings, + unsigned long *pstroffs, + unsigned long *strmap) +{ + return 0; +} + static inline void add_kallsyms(struct module *mod, Elf_Shdr *sechdrs, + unsigned int shnum, unsigned int symindex, unsigned int strindex, - const char *secstrings) + unsigned long symoffs, + unsigned long stroffs, + const char *secstrings, + const unsigned long *strmap) { } #endif /* CONFIG_KALLSYMS */ @@ -1954,6 +2083,8 @@ static noinline struct module *load_module(void __user *umod, struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ + unsigned long symoffs, stroffs, *strmap; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2035,11 +2166,6 @@ static noinline struct module *load_module(void __user *umod, /* Don't keep modinfo and version sections. */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; sechdrs[versindex].sh_flags &= ~(unsigned long)SHF_ALLOC; -#ifdef CONFIG_KALLSYMS - /* Keep symbol and string tables for decoding later. */ - sechdrs[symindex].sh_flags |= SHF_ALLOC; - sechdrs[strindex].sh_flags |= SHF_ALLOC; -#endif /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -2075,6 +2201,13 @@ static noinline struct module *load_module(void __user *umod, goto free_hdr; } + strmap = kzalloc(BITS_TO_LONGS(sechdrs[strindex].sh_size) + * sizeof(long), GFP_KERNEL); + if (!strmap) { + err = -ENOMEM; + goto free_mod; + } + if (find_module(mod->name)) { err = -EEXIST; goto free_mod; @@ -2104,6 +2237,8 @@ static noinline struct module *load_module(void __user *umod, this is done generically; there doesn't appear to be any special cases for the architectures. */ layout_sections(mod, hdr, sechdrs, secstrings); + symoffs = layout_symtab(mod, sechdrs, symindex, strindex, hdr, + secstrings, &stroffs, strmap); /* Do the allocs. */ ptr = module_alloc_update_bounds(mod->core_size); @@ -2308,7 +2443,10 @@ static noinline struct module *load_module(void __user *umod, percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, sechdrs[pcpuindex].sh_size); - add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, + symoffs, stroffs, secstrings, strmap); + kfree(strmap); + strmap = NULL; if (!mod->taints) { struct _ddebug *debug; @@ -2380,13 +2518,14 @@ static noinline struct module *load_module(void __user *umod, synchronize_sched(); module_arch_cleanup(mod); cleanup: + free_modinfo(mod); kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - free_init: percpu_modfree(mod->refptr); + free_init: #endif module_free(mod, mod->module_init); free_core: @@ -2397,6 +2536,7 @@ static noinline struct module *load_module(void __user *umod, percpu_modfree(percpu); free_mod: kfree(args); + kfree(strmap); free_hdr: vfree(hdr); return ERR_PTR(err); @@ -2486,6 +2626,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, /* Drop initial reference. */ module_put(mod); trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; @@ -2947,7 +3092,6 @@ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, struct kernel_symbol *ks, - struct marker *marker, struct tracepoint *tp) { } diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c index 5aa854f9e5ae..2a5dfec8efe0 100644 --- a/kernel/ns_cgroup.c +++ b/kernel/ns_cgroup.c @@ -42,8 +42,8 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid) * (hence either you are in the same cgroup as task, or in an * ancestor cgroup thereof) */ -static int ns_can_attach(struct cgroup_subsys *ss, - struct cgroup *new_cgroup, struct task_struct *task) +static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, + struct task_struct *task, bool threadgroup) { if (current != task) { if (!capable(CAP_SYS_ADMIN)) @@ -56,6 +56,18 @@ static int ns_can_attach(struct cgroup_subsys *ss, if (!cgroup_is_descendant(new_cgroup, task)) return -EPERM; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &task->thread_group, thread_group) { + if (!cgroup_is_descendant(new_cgroup, c)) { + rcu_read_unlock(); + return -EPERM; + } + } + rcu_read_unlock(); + } + return 0; } diff --git a/kernel/panic.c b/kernel/panic.c index bc4dcb6a389b..96b45d0b4ba5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -178,7 +178,7 @@ static const struct tnt tnts[] = { * 'W' - Taint on warning. * 'C' - modules from drivers/staging are loaded. * - * The string is overwritten by the next call to print_taint(). + * The string is overwritten by the next call to print_tainted(). */ const char *print_tainted(void) { diff --git a/kernel/params.c b/kernel/params.c index 7f6912ced2ba..9da58eabdcb2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -23,6 +23,7 @@ #include <linux/device.h> #include <linux/err.h> #include <linux/slab.h> +#include <linux/ctype.h> #if 0 #define DEBUGP printk @@ -87,7 +88,7 @@ static char *next_arg(char *args, char **param, char **val) } for (i = 0; args[i]; i++) { - if (args[i] == ' ' && !in_quote) + if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') @@ -121,7 +122,7 @@ static char *next_arg(char *args, char **param, char **val) next = args + i; /* Chew up trailing spaces. */ - while (*next == ' ') + while (isspace(*next)) next++; return next; } @@ -138,7 +139,7 @@ int parse_args(const char *name, DEBUGP("Parsing ARGS: %s\n", args); /* Chew leading spaces */ - while (*args == ' ') + while (isspace(*args)) args++; while (*args) { diff --git a/kernel/perf_counter.c b/kernel/perf_event.c index cc768ab81ac8..9d0b5c665883 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_event.c @@ -1,12 +1,12 @@ /* - * Performance counter core code + * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * - * For licensing details see kernel-base/COPYING + * For licensing details see kernel-base/COPYING */ #include <linux/fs.h> @@ -20,72 +20,73 @@ #include <linux/percpu.h> #include <linux/ptrace.h> #include <linux/vmstat.h> +#include <linux/vmalloc.h> #include <linux/hardirq.h> #include <linux/rculist.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/anon_inodes.h> #include <linux/kernel_stat.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/irq_regs.h> /* - * Each CPU has a list of per CPU counters: + * Each CPU has a list of per CPU events: */ DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); -int perf_max_counters __read_mostly = 1; +int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; /* - * perf counter paranoia level: + * perf event paranoia level: * -1 - not paranoid at all * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu counters for unpriv + * 1 - disallow cpu events for unpriv * 2 - disallow kernel profiling for unpriv */ -int sysctl_perf_counter_paranoid __read_mostly = 1; +int sysctl_perf_event_paranoid __read_mostly = 1; static inline bool perf_paranoid_tracepoint_raw(void) { - return sysctl_perf_counter_paranoid > -1; + return sysctl_perf_event_paranoid > -1; } static inline bool perf_paranoid_cpu(void) { - return sysctl_perf_counter_paranoid > 0; + return sysctl_perf_event_paranoid > 0; } static inline bool perf_paranoid_kernel(void) { - return sysctl_perf_counter_paranoid > 1; + return sysctl_perf_event_paranoid > 1; } -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* - * max perf counter sample rate + * max perf event sample rate */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; +int sysctl_perf_event_sample_rate __read_mostly = 100000; -static atomic64_t perf_counter_id; +static atomic64_t perf_event_id; /* - * Lock for (sysadmin-configurable) counter reservations: + * Lock for (sysadmin-configurable) event reservations: */ static DEFINE_SPINLOCK(perf_resource_lock); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) { return NULL; } @@ -93,18 +94,18 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, +hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) + struct perf_event_context *ctx, int cpu) { return 0; } -void __weak perf_counter_print_debug(void) { } +void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); @@ -130,20 +131,20 @@ void perf_enable(void) hw_perf_enable(); } -static void get_ctx(struct perf_counter_context *ctx) +static void get_ctx(struct perf_event_context *ctx) { WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } static void free_ctx(struct rcu_head *head) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; - ctx = container_of(head, struct perf_counter_context, rcu_head); + ctx = container_of(head, struct perf_event_context, rcu_head); kfree(ctx); } -static void put_ctx(struct perf_counter_context *ctx) +static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) @@ -154,7 +155,7 @@ static void put_ctx(struct perf_counter_context *ctx) } } -static void unclone_ctx(struct perf_counter_context *ctx) +static void unclone_ctx(struct perf_event_context *ctx) { if (ctx->parent_ctx) { put_ctx(ctx->parent_ctx); @@ -163,37 +164,37 @@ static void unclone_ctx(struct perf_counter_context *ctx) } /* - * If we inherit counters we want to return the parent counter id + * If we inherit events we want to return the parent event id * to userspace. */ -static u64 primary_counter_id(struct perf_counter *counter) +static u64 primary_event_id(struct perf_event *event) { - u64 id = counter->id; + u64 id = event->id; - if (counter->parent) - id = counter->parent->id; + if (event->parent) + id = event->parent->id; return id; } /* - * Get the perf_counter_context for a task and lock it. + * Get the perf_event_context for a task and lock it. * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ -static struct perf_counter_context * +static struct perf_event_context * perf_lock_task_context(struct task_struct *task, unsigned long *flags) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; rcu_read_lock(); retry: - ctx = rcu_dereference(task->perf_counter_ctxp); + ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* * If this context is a clone of another, it might * get swapped for another underneath us by - * perf_counter_task_sched_out, though the + * perf_event_task_sched_out, though the * rcu_read_lock() protects us from any context * getting freed. Lock the context and check if it * got swapped before we could get the lock, and retry @@ -201,7 +202,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped on us any more. */ spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { + if (ctx != rcu_dereference(task->perf_event_ctxp)) { spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } @@ -220,9 +221,9 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; unsigned long flags; ctx = perf_lock_task_context(task, &flags); @@ -233,7 +234,7 @@ static struct perf_counter_context *perf_pin_task_context(struct task_struct *ta return ctx; } -static void perf_unpin_context(struct perf_counter_context *ctx) +static void perf_unpin_context(struct perf_event_context *ctx) { unsigned long flags; @@ -244,123 +245,122 @@ static void perf_unpin_context(struct perf_counter_context *ctx) } /* - * Add a counter from the lists for its context. + * Add a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_counter *group_leader = counter->group_leader; + struct perf_event *group_leader = event->group_leader; /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group * leader's sibling list: */ - if (group_leader == counter) - list_add_tail(&counter->list_entry, &ctx->counter_list); + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); else { - list_add_tail(&counter->list_entry, &group_leader->sibling_list); + list_add_tail(&event->group_entry, &group_leader->sibling_list); group_leader->nr_siblings++; } - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) ctx->nr_stat++; } /* - * Remove a counter from the lists for its context. + * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. */ static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_counter *sibling, *tmp; + struct perf_event *sibling, *tmp; - if (list_empty(&counter->list_entry)) + if (list_empty(&event->group_entry)) return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) + ctx->nr_events--; + if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&counter->list_entry); - list_del_rcu(&counter->event_entry); + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; + if (event->group_leader != event) + event->group_leader->nr_siblings--; /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them * to the context list directly: */ - list_for_each_entry_safe(sibling, tmp, - &counter->sibling_list, list_entry) { + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - list_move_tail(&sibling->list_entry, &ctx->counter_list); + list_move_tail(&sibling->group_entry, &ctx->group_list); sibling->group_leader = sibling; } } static void -counter_sched_out(struct perf_counter *counter, +event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) + struct perf_event_context *ctx) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE) return; - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; - if (!is_software_counter(counter)) + if (!is_software_event(event)) cpuctx->active_oncpu--; ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) + if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; } static void -group_sched_out(struct perf_counter *group_counter, +group_sched_out(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) + struct perf_event_context *ctx) { - struct perf_counter *counter; + struct perf_event *event; - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + if (group_event->state != PERF_EVENT_STATE_ACTIVE) return; - counter_sched_out(group_counter, cpuctx, ctx); + event_sched_out(group_event, cpuctx, ctx); /* * Schedule out siblings (if any): */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); - if (group_counter->attr.exclusive) + if (group_event->attr.exclusive) cpuctx->exclusive = 0; } /* - * Cross CPU call to remove a performance counter + * Cross CPU call to remove a performance event * - * We disable the counter on the hardware level first. After that we + * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static void __perf_counter_remove_from_context(void *info) +static void __perf_event_remove_from_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; /* * If this is a task context, we need to check whether it is @@ -373,22 +373,22 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the - * counters on a global level. + * events on a global level. */ perf_disable(); - counter_sched_out(counter, cpuctx, ctx); + event_sched_out(event, cpuctx, ctx); - list_del_counter(counter, ctx); + list_del_event(event, ctx); if (!ctx->task) { /* - * Allow more per task counters with respect to the + * Allow more per task events with respect to the * reservation: */ cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); } perf_enable(); @@ -397,56 +397,56 @@ static void __perf_counter_remove_from_context(void *info) /* - * Remove the counter from a task's (or a CPU's) list of counters. + * Remove the event from a task's (or a CPU's) list of events. * * Must be called with ctx->mutex held. * - * CPU counters are removed with a smp call. For task counters we only + * CPU events are removed with a smp call. For task events we only * call when the task is on a CPU. * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the + * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_counter_remove_from_context(struct perf_counter *counter) +static void perf_event_remove_from_context(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; if (!task) { /* - * Per cpu counters are removed via an smp call and + * Per cpu events are removed via an smp call and * the removal is always sucessful. */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); return; } retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ - if (ctx->nr_active && !list_empty(&counter->list_entry)) { + if (ctx->nr_active && !list_empty(&event->group_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not + * can remove the event safely, if the call above did not * succeed. */ - if (!list_empty(&counter->list_entry)) { - list_del_counter(counter, ctx); + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); } spin_unlock_irq(&ctx->lock); } @@ -459,7 +459,7 @@ static inline u64 perf_clock(void) /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_counter_context *ctx) +static void update_context_time(struct perf_event_context *ctx) { u64 now = perf_clock(); @@ -468,51 +468,51 @@ static void update_context_time(struct perf_counter_context *ctx) } /* - * Update the total_time_enabled and total_time_running fields for a counter. + * Update the total_time_enabled and total_time_running fields for a event. */ -static void update_counter_times(struct perf_counter *counter) +static void update_event_times(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; u64 run_end; - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + event->total_time_enabled = ctx->time - event->tstamp_enabled; - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; else run_end = ctx->time; - counter->total_time_running = run_end - counter->tstamp_running; + event->total_time_running = run_end - event->tstamp_running; } /* - * Update total_time_enabled and total_time_running for all counters in a group. + * Update total_time_enabled and total_time_running for all events in a group. */ -static void update_group_times(struct perf_counter *leader) +static void update_group_times(struct perf_event *leader) { - struct perf_counter *counter; + struct perf_event *event; - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, list_entry) - update_counter_times(counter); + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); } /* - * Cross CPU call to disable a performance counter + * Cross CPU call to disable a performance event */ -static void __perf_counter_disable(void *info) +static void __perf_event_disable(void *info) { - struct perf_counter *counter = info; + struct perf_event *event = info; struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. */ if (ctx->task && cpuctx->task_ctx != ctx) return; @@ -520,57 +520,57 @@ static void __perf_counter_disable(void *info) spin_lock(&ctx->lock); /* - * If the counter is on, turn it off. + * If the event is on, turn it off. * If it is in error state, leave it in error state. */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + if (event->state >= PERF_EVENT_STATE_INACTIVE) { update_context_time(ctx); - update_group_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } spin_unlock(&ctx->lock); } /* - * Disable a counter. + * Disable a event. * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. + * hence we can't get into perf_event_task_sched_out for this context. */ -static void perf_counter_disable(struct perf_counter *counter) +static void perf_event_disable(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; if (!task) { /* - * Disable the counter on the cpu that it's on + * Disable the event on the cpu that it's on */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); return; } retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); + task_oncpu_function_call(task, __perf_event_disable, event); spin_lock_irq(&ctx->lock); /* - * If the counter is still active, we need to retry the cross-call. + * If the event is still active, we need to retry the cross-call. */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + if (event->state == PERF_EVENT_STATE_ACTIVE) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -579,73 +579,73 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; } spin_unlock_irq(&ctx->lock); } static int -counter_sched_in(struct perf_counter *counter, +event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, + struct perf_event_context *ctx, int cpu) { - if (counter->state <= PERF_COUNTER_STATE_OFF) + if (event->state <= PERF_EVENT_STATE_OFF) return 0; - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ /* * The new state must be visible before we turn it on in the hardware: */ smp_wmb(); - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; return -EAGAIN; } - counter->tstamp_running += ctx->time - counter->tstamp_stopped; + event->tstamp_running += ctx->time - event->tstamp_stopped; - if (!is_software_counter(counter)) + if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; - if (counter->attr.exclusive) + if (event->attr.exclusive) cpuctx->exclusive = 1; return 0; } static int -group_sched_in(struct perf_counter *group_counter, +group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, + struct perf_event_context *ctx, int cpu) { - struct perf_counter *counter, *partial_group; + struct perf_event *event, *partial_group; int ret; - if (group_counter->state == PERF_COUNTER_STATE_OFF) + if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); if (ret) return ret < 0 ? ret : 0; - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + if (event_sched_in(group_event, cpuctx, ctx, cpu)) return -EAGAIN; /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; goto group_error; } } @@ -657,57 +657,57 @@ group_error: * Groups can be scheduled in as one unit only, so undo any * partial group before returning: */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) break; - counter_sched_out(counter, cpuctx, ctx); + event_sched_out(event, cpuctx, ctx); } - counter_sched_out(group_counter, cpuctx, ctx); + event_sched_out(group_event, cpuctx, ctx); return -EAGAIN; } /* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. */ -static int is_software_only_group(struct perf_counter *leader) +static int is_software_only_group(struct perf_event *leader) { - struct perf_counter *counter; + struct perf_event *event; - if (!is_software_counter(leader)) + if (!is_software_event(leader)) return 0; - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) return 0; return 1; } /* - * Work out whether we can put this counter group on the CPU now. + * Work out whether we can put this event group on the CPU now. */ -static int group_can_go_on(struct perf_counter *counter, +static int group_can_go_on(struct perf_event *event, struct perf_cpu_context *cpuctx, int can_add_hw) { /* - * Groups consisting entirely of software counters can always go on. + * Groups consisting entirely of software events can always go on. */ - if (is_software_only_group(counter)) + if (is_software_only_group(event)) return 1; /* * If an exclusive group is already on, no other hardware - * counters can go on. + * events can go on. */ if (cpuctx->exclusive) return 0; /* * If this group is exclusive and there are already - * counters on the CPU, it can't go on. + * events on the CPU, it can't go on. */ - if (counter->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && cpuctx->active_oncpu) return 0; /* * Otherwise, try to add it if all previous groups were able @@ -716,26 +716,26 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) { - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; } /* - * Cross CPU call to install and enable a performance counter + * Cross CPU call to install and enable a performance event * * Must be called with ctx->mutex held */ static void __perf_install_in_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; int cpu = smp_processor_id(); int err; @@ -744,7 +744,7 @@ static void __perf_install_in_context(void *info) * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. + * on this cpu because it had no events. */ if (ctx->task && cpuctx->task_ctx != ctx) { if (cpuctx->task_ctx || ctx->task != current) @@ -758,41 +758,41 @@ static void __perf_install_in_context(void *info) /* * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. + * events on a global level. NOP for non NMI based events. */ perf_disable(); - add_counter_to_ctx(counter, ctx); + add_event_to_ctx(event, ctx); /* - * Don't put the counter on if it is disabled or if + * Don't put the event on if it is disabled or if * it is in a group and the group isn't on. */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) goto unlock; /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. */ - if (!group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(event, cpuctx, 1)) err = -EEXIST; else - err = counter_sched_in(counter, cpuctx, ctx, cpu); + err = event_sched_in(event, cpuctx, ctx, cpu); if (err) { /* - * This counter couldn't go on. If it is in a group + * This event couldn't go on. If it is in a group * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. + * If the event group is pinned then put it in error state. */ - if (leader != counter) + if (leader != event) group_sched_out(leader, cpuctx, ctx); if (leader->attr.pinned) { update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; + leader->state = PERF_EVENT_STATE_ERROR; } } @@ -806,92 +806,92 @@ static void __perf_install_in_context(void *info) } /* - * Attach a performance counter to a context + * Attach a performance event to a context * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. * - * If the counter is attached to a task which is on a CPU we use a smp + * If the event is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. * * Must be called with ctx->mutex held. */ static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, int cpu) { struct task_struct *task = ctx->task; if (!task) { /* - * Per cpu counters are installed via an smp call and + * Per cpu events are installed via an smp call and * the install is always sucessful. */ smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); + event, 1); return; } retry: task_oncpu_function_call(task, __perf_install_in_context, - counter); + event); spin_lock_irq(&ctx->lock); /* * we need to retry the smp call. */ - if (ctx->is_active && list_empty(&counter->list_entry)) { + if (ctx->is_active && list_empty(&event->group_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not + * can add the event safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list_entry)) - add_counter_to_ctx(counter, ctx); + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); spin_unlock_irq(&ctx->lock); } /* - * Put a counter into inactive state and update time fields. + * Put a event into inactive state and update time fields. * Enabling the leader of a group effectively enables all * the group members that aren't explicitly disabled, so we * have to update their ->tstamp_enabled also. * Note: this works for group members as well as group leaders * since the non-leader members' sibling_lists will be empty. */ -static void __perf_counter_mark_enabled(struct perf_counter *counter, - struct perf_counter_context *ctx) +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) { - struct perf_counter *sub; + struct perf_event *sub; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - list_for_each_entry(sub, &counter->sibling_list, list_entry) - if (sub->state >= PERF_COUNTER_STATE_INACTIVE) + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) sub->tstamp_enabled = ctx->time - sub->total_time_enabled; } /* - * Cross CPU call to enable a performance counter + * Cross CPU call to enable a performance event */ -static void __perf_counter_enable(void *info) +static void __perf_event_enable(void *info) { - struct perf_counter *counter = info; + struct perf_event *event = info; struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; int err; /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. */ if (ctx->task && cpuctx->task_ctx != ctx) { if (cpuctx->task_ctx || ctx->task != current) @@ -903,40 +903,40 @@ static void __perf_counter_enable(void *info) ctx->is_active = 1; update_context_time(ctx); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + if (event->state >= PERF_EVENT_STATE_INACTIVE) goto unlock; - __perf_counter_mark_enabled(counter, ctx); + __perf_event_mark_enabled(event, ctx); /* - * If the counter is in a group and isn't the group leader, + * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) goto unlock; - if (!group_can_go_on(counter, cpuctx, 1)) { + if (!group_can_go_on(event, cpuctx, 1)) { err = -EEXIST; } else { perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, smp_processor_id()); else - err = counter_sched_in(counter, cpuctx, ctx, + err = event_sched_in(event, cpuctx, ctx, smp_processor_id()); perf_enable(); } if (err) { /* - * If this counter can't go on and it's part of a + * If this event can't go on and it's part of a * group, then the whole group has to come off. */ - if (leader != counter) + if (leader != event) group_sched_out(leader, cpuctx, ctx); if (leader->attr.pinned) { update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; + leader->state = PERF_EVENT_STATE_ERROR; } } @@ -945,100 +945,96 @@ static void __perf_counter_enable(void *info) } /* - * Enable a counter. + * Enable a event. * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. */ -static void perf_counter_enable(struct perf_counter *counter) +static void perf_event_enable(struct perf_event *event) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; if (!task) { /* - * Enable the counter on the cpu that it's on + * Enable the event on the cpu that it's on */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); return; } spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + if (event->state >= PERF_EVENT_STATE_INACTIVE) goto out; /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we * know that it has gone back into error state, as distinct * from the task having been scheduled away before the * cross-call arrived. */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; retry: spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); + task_oncpu_function_call(task, __perf_event_enable, event); spin_lock_irq(&ctx->lock); /* - * If the context is active and the counter is still off, + * If the context is active and the event is still off, * we need to retry the cross-call. */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) goto retry; /* * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_OFF) - __perf_counter_mark_enabled(counter, ctx); + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); out: spin_unlock_irq(&ctx->lock); } -static int perf_counter_refresh(struct perf_counter *counter, int refresh) +static int perf_event_refresh(struct perf_event *event, int refresh) { /* - * not supported on inherited counters + * not supported on inherited events */ - if (counter->attr.inherit) + if (event->attr.inherit) return -EINVAL; - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); return 0; } -void __perf_counter_sched_out(struct perf_counter_context *ctx, +void __perf_event_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx) { - struct perf_counter *counter; + struct perf_event *event; spin_lock(&ctx->lock); ctx->is_active = 0; - if (likely(!ctx->nr_counters)) + if (likely(!ctx->nr_events)) goto out; update_context_time(ctx); perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); - else - group_sched_out(counter, cpuctx, ctx); - } - } + if (ctx->nr_active) + list_for_each_entry(event, &ctx->group_list, group_entry) + group_sched_out(event, cpuctx, ctx); + perf_enable(); out: spin_unlock(&ctx->lock); @@ -1047,46 +1043,46 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, /* * Test whether two contexts are equivalent, i.e. whether they * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family + * events via prctl, or enable/disable all events in a family * via ioctl, which will have the same effect on both contexts. */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx && ctx1->parent_gen == ctx2->parent_gen && !ctx1->pin_count && !ctx2->pin_count; } -static void __perf_counter_read(void *counter); +static void __perf_event_read(void *event); -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) { u64 value; - if (!counter->attr.inherit_stat) + if (!event->attr.inherit_stat) return; /* - * Update the counter value, we cannot use perf_counter_read() + * Update the event value, we cannot use perf_event_read() * because we're in the middle of a context switch and have IRQs * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we + * we know the event must be on the current CPU, therefore we * don't need to use it. */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); break; - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); break; default: @@ -1094,73 +1090,73 @@ static void __perf_counter_sync_stat(struct perf_counter *counter, } /* - * In order to keep per-task stats reliable we need to flip the counter + * In order to keep per-task stats reliable we need to flip the event * values when we flip the contexts. */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); /* * Since we swizzled the values, update the user visible data too. */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); } #define list_next_entry(pos, member) \ list_entry(pos->member.next, typeof(*pos), member) -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) { - struct perf_counter *counter, *next_counter; + struct perf_event *event, *next_event; if (!ctx->nr_stat) return; - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { - __perf_counter_sync_stat(counter, next_counter); + __perf_event_sync_stat(event, next_event); - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); } } /* - * Called from scheduler to remove the counters of the current task, + * Called from scheduler to remove the events of the current task, * with interrupts disabled. * - * We stop each counter and update the counter value in counter->count. + * We stop each event and update the event value in event->count. * * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. */ -void perf_counter_task_sched_out(struct task_struct *task, +void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; struct pt_regs *regs; int do_switch = 1; regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); if (likely(!ctx || !cpuctx->task_ctx)) return; @@ -1169,7 +1165,7 @@ void perf_counter_task_sched_out(struct task_struct *task, rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; + next_ctx = next->perf_event_ctxp; if (parent && next_ctx && rcu_dereference(next_ctx->parent_ctx) == parent) { /* @@ -1186,15 +1182,15 @@ void perf_counter_task_sched_out(struct task_struct *task, if (context_equiv(ctx, next_ctx)) { /* * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp + * wrt to rcu_dereference() of perf_event_ctxp */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; ctx->task = next; next_ctx->task = task; do_switch = 0; - perf_counter_sync_stat(ctx, next_ctx); + perf_event_sync_stat(ctx, next_ctx); } spin_unlock(&next_ctx->lock); spin_unlock(&ctx->lock); @@ -1202,7 +1198,7 @@ void perf_counter_task_sched_out(struct task_struct *task, rcu_read_unlock(); if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); + __perf_event_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } } @@ -1210,7 +1206,7 @@ void perf_counter_task_sched_out(struct task_struct *task, /* * Called with IRQs disabled */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) +static void __perf_event_task_sched_out(struct perf_event_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); @@ -1220,28 +1216,28 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) return; - __perf_counter_sched_out(ctx, cpuctx); + __perf_event_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } /* * Called with IRQs disabled */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) { - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); + __perf_event_sched_out(&cpuctx->ctx, cpuctx); } static void -__perf_counter_sched_in(struct perf_counter_context *ctx, +__perf_event_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { - struct perf_counter *counter; + struct perf_event *event; int can_add_hw = 1; spin_lock(&ctx->lock); ctx->is_active = 1; - if (likely(!ctx->nr_counters)) + if (likely(!ctx->nr_events)) goto out; ctx->timestamp = perf_clock(); @@ -1252,55 +1248,45 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) continue; - if (counter->cpu != -1 && counter->cpu != cpu) + if (event->cpu != -1 && event->cpu != cpu) continue; - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); - else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); - } + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); /* * If this pinned group hasn't been scheduled, * put it in error state. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; } } - list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_for_each_entry(event, &ctx->group_list, group_entry) { /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) continue; /* * Listen to the 'cpu' scheduling filter constraint - * of counters: + * of events: */ - if (counter->cpu != -1 && counter->cpu != cpu) + if (event->cpu != -1 && event->cpu != cpu) continue; - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) + if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_sched_in(event, cpuctx, ctx, cpu)) can_add_hw = 0; - } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } } perf_enable(); out: @@ -1308,48 +1294,48 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, } /* - * Called from scheduler to add the counters of the current task + * Called from scheduler to add the events of the current task * with interrupts disabled. * - * We restore the counter value and then enable it. + * We restore the event value and then enable it. * * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) +void perf_event_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_event_context *ctx = task->perf_event_ctxp; if (likely(!ctx)) return; if (cpuctx->task_ctx == ctx) return; - __perf_counter_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) { - struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_event_context *ctx = &cpuctx->ctx; - __perf_counter_sched_in(ctx, cpuctx, cpu); + __perf_event_sched_in(ctx, cpuctx, cpu); } #define MAX_INTERRUPTS (~0ULL) -static void perf_log_throttle(struct perf_counter *counter, int enable); +static void perf_log_throttle(struct perf_event *event, int enable); -static void perf_adjust_period(struct perf_counter *counter, u64 events) +static void perf_adjust_period(struct perf_event *event, u64 events) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; u64 period, sample_period; s64 delta; events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); + period = div64_u64(events, event->attr.sample_freq); delta = (s64)(period - hwc->sample_period); delta = (delta + 7) / 8; /* low pass filter */ @@ -1362,39 +1348,39 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events) hwc->sample_period = sample_period; } -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) { - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct perf_event *event; + struct hw_perf_event *hwc; u64 interrupts, freq; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) continue; - hwc = &counter->hw; + hwc = &event->hw; interrupts = hwc->interrupts; hwc->interrupts = 0; /* - * unthrottle counters on the tick + * unthrottle events on the tick */ if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; } - if (!counter->attr.freq || !counter->attr.sample_freq) + if (!event->attr.freq || !event->attr.sample_freq) continue; /* * if the specified freq < HZ then we need to skip ticks */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; hwc->freq_count += freq; hwc->freq_interrupts += interrupts; @@ -1408,7 +1394,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) } else freq = HZ; - perf_adjust_period(counter, freq * interrupts); + perf_adjust_period(event, freq * interrupts); /* * In order to avoid being stalled by an (accidental) huge @@ -1417,9 +1403,9 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) */ if (!interrupts) { perf_disable(); - counter->pmu->disable(counter); + event->pmu->disable(event); atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); + event->pmu->enable(event); perf_enable(); } } @@ -1427,22 +1413,22 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) } /* - * Round-robin a context's counters: + * Round-robin a context's events: */ -static void rotate_ctx(struct perf_counter_context *ctx) +static void rotate_ctx(struct perf_event_context *ctx) { - struct perf_counter *counter; + struct perf_event *event; - if (!ctx->nr_counters) + if (!ctx->nr_events) return; spin_lock(&ctx->lock); /* - * Rotate the first entry last (works just fine for group counters too): + * Rotate the first entry last (works just fine for group events too): */ perf_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - list_move_tail(&counter->list_entry, &ctx->counter_list); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); break; } perf_enable(); @@ -1450,93 +1436,93 @@ static void rotate_ctx(struct perf_counter_context *ctx) spin_unlock(&ctx->lock); } -void perf_counter_task_tick(struct task_struct *curr, int cpu) +void perf_event_task_tick(struct task_struct *curr, int cpu) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_event_context *ctx; - if (!atomic_read(&nr_counters)) + if (!atomic_read(&nr_events)) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; + ctx = curr->perf_event_ctxp; perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); - perf_counter_cpu_sched_out(cpuctx); + perf_event_cpu_sched_out(cpuctx); if (ctx) - __perf_counter_task_sched_out(ctx); + __perf_event_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); if (ctx) rotate_ctx(ctx); - perf_counter_cpu_sched_in(cpuctx, cpu); + perf_event_cpu_sched_in(cpuctx, cpu); if (ctx) - perf_counter_task_sched_in(curr, cpu); + perf_event_task_sched_in(curr, cpu); } /* - * Enable all of a task's counters that have been marked enable-on-exec. + * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_counter_enable_on_exec(struct task_struct *task) +static void perf_event_enable_on_exec(struct task_struct *task) { - struct perf_counter_context *ctx; - struct perf_counter *counter; + struct perf_event_context *ctx; + struct perf_event *event; unsigned long flags; int enabled = 0; local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) goto out; - __perf_counter_task_sched_out(ctx); + __perf_event_task_sched_out(ctx); spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (!counter->attr.enable_on_exec) + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) continue; - __perf_counter_mark_enabled(counter, ctx); + __perf_event_mark_enabled(event, ctx); enabled = 1; } /* - * Unclone this context if we enabled any counter. + * Unclone this context if we enabled any event. */ if (enabled) unclone_ctx(ctx); spin_unlock(&ctx->lock); - perf_counter_task_sched_in(task, smp_processor_id()); + perf_event_task_sched_in(task, smp_processor_id()); out: local_irq_restore(flags); } /* - * Cross CPU call to read the hardware counter + * Cross CPU call to read the hardware event */ -static void __perf_counter_read(void *info) +static void __perf_event_read(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; unsigned long flags; /* * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. + * event->count would have been updated to a recent sample + * when the event was scheduled out. */ if (ctx->task && cpuctx->task_ctx != ctx) return; @@ -1544,56 +1530,56 @@ static void __perf_counter_read(void *info) local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); + event->pmu->read(event); + update_event_times(event); local_irq_restore(flags); } -static u64 perf_counter_read(struct perf_counter *counter) +static u64 perf_event_read(struct perf_event *event) { /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: + * If event is enabled and currently active on a CPU, update the + * value in the event structure: */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); } - return atomic64_read(&counter->count); + return atomic64_read(&event->count); } /* - * Initialize the perf_counter context in a task_struct: + * Initialize the perf_event context in a task_struct: */ static void -__perf_counter_init_context(struct perf_counter_context *ctx, +__perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->group_list); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); ctx->task = task; } -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +static struct perf_event_context *find_get_context(pid_t pid, int cpu) { - struct perf_counter_context *ctx; + struct perf_event_context *ctx; struct perf_cpu_context *cpuctx; struct task_struct *task; unsigned long flags; int err; /* - * If cpu is not a wildcard then this is a percpu counter: + * If cpu is not a wildcard then this is a percpu event: */ if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ + /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); @@ -1601,7 +1587,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(-EINVAL); /* - * We could be clever and allow to attach a counter to an + * We could be clever and allow to attach a event to an * offline CPU and activate it when the CPU comes up, but * that's for later. */ @@ -1628,7 +1614,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(-ESRCH); /* - * Can't attach counters to a dying task. + * Can't attach events to a dying task. */ err = -ESRCH; if (task->flags & PF_EXITING) @@ -1647,13 +1633,13 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; - __perf_counter_init_context(ctx, task); + __perf_event_init_context(ctx, task); get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. @@ -1672,42 +1658,42 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ERR_PTR(err); } -static void free_counter_rcu(struct rcu_head *head) +static void free_event_rcu(struct rcu_head *head) { - struct perf_counter *counter; + struct perf_event *event; - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); } -static void perf_pending_sync(struct perf_counter *counter); +static void perf_pending_sync(struct perf_event *event); -static void free_counter(struct perf_counter *counter) +static void free_event(struct perf_event *event) { - perf_pending_sync(counter); + perf_pending_sync(event); - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); } - if (counter->output) { - fput(counter->output->filp); - counter->output = NULL; + if (event->output) { + fput(event->output->filp); + event->output = NULL; } - if (counter->destroy) - counter->destroy(counter); + if (event->destroy) + event->destroy(event); - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); } /* @@ -1715,43 +1701,43 @@ static void free_counter(struct perf_counter *counter) */ static int perf_release(struct inode *inode, struct file *file) { - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; file->private_data = NULL; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); + perf_event_remove_from_context(event); mutex_unlock(&ctx->mutex); - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); - free_counter(counter); + free_event(event); return 0; } -static int perf_counter_read_size(struct perf_counter *counter) +static int perf_event_read_size(struct perf_event *event) { int entry = sizeof(u64); /* value */ int size = 0; int nr = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) size += sizeof(u64); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) size += sizeof(u64); - if (counter->attr.read_format & PERF_FORMAT_ID) + if (event->attr.read_format & PERF_FORMAT_ID) entry += sizeof(u64); - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; size += sizeof(u64); } @@ -1760,27 +1746,27 @@ static int perf_counter_read_size(struct perf_counter *counter) return size; } -static u64 perf_counter_read_value(struct perf_counter *counter) +static u64 perf_event_read_value(struct perf_event *event) { - struct perf_counter *child; + struct perf_event *child; u64 total = 0; - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); return total; } -static int perf_counter_read_entry(struct perf_counter *counter, +static int perf_event_read_entry(struct perf_event *event, u64 read_format, char __user *buf) { int n = 0, count = 0; u64 values[2]; - values[n++] = perf_counter_read_value(counter); + values[n++] = perf_event_read_value(event); if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + values[n++] = primary_event_id(event); count = n * sizeof(u64); @@ -1790,10 +1776,10 @@ static int perf_counter_read_entry(struct perf_counter *counter, return count; } -static int perf_counter_read_group(struct perf_counter *counter, +static int perf_event_read_group(struct perf_event *event, u64 read_format, char __user *buf) { - struct perf_counter *leader = counter->group_leader, *sub; + struct perf_event *leader = event->group_leader, *sub; int n = 0, size = 0, err = -EFAULT; u64 values[3]; @@ -1812,14 +1798,14 @@ static int perf_counter_read_group(struct perf_counter *counter, if (copy_to_user(buf, values, size)) return -EFAULT; - err = perf_counter_read_entry(leader, read_format, buf + size); + err = perf_event_read_entry(leader, read_format, buf + size); if (err < 0) return err; size += err; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - err = perf_counter_read_entry(sub, read_format, + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, buf + size); if (err < 0) return err; @@ -1830,23 +1816,23 @@ static int perf_counter_read_group(struct perf_counter *counter, return size; } -static int perf_counter_read_one(struct perf_counter *counter, +static int perf_event_read_one(struct perf_event *event, u64 read_format, char __user *buf) { u64 values[4]; int n = 0; - values[n++] = perf_counter_read_value(counter); + values[n++] = perf_event_read_value(event); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + values[n++] = primary_event_id(event); if (copy_to_user(buf, values, n * sizeof(u64))) return -EFAULT; @@ -1855,32 +1841,32 @@ static int perf_counter_read_one(struct perf_counter *counter, } /* - * Read the performance counter - simple non blocking version for now + * Read the performance event - simple non blocking version for now */ static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) { - u64 read_format = counter->attr.read_format; + u64 read_format = event->attr.read_format; int ret; /* - * Return end-of-file for a read on a counter that is in + * Return end-of-file for a read on a event that is in * error state (i.e. because it was pinned but it couldn't be * scheduled on to the CPU at some point). */ - if (counter->state == PERF_COUNTER_STATE_ERROR) + if (event->state == PERF_EVENT_STATE_ERROR) return 0; - if (count < perf_counter_read_size(counter)) + if (count < perf_event_read_size(event)) return -ENOSPC; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); + ret = perf_event_read_group(event, read_format, buf); else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); return ret; } @@ -1888,79 +1874,79 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct perf_counter *counter = file->private_data; + struct perf_event *event = file->private_data; - return perf_read_hw(counter, buf, count); + return perf_read_hw(event, buf, count); } static unsigned int perf_poll(struct file *file, poll_table *wait) { - struct perf_counter *counter = file->private_data; + struct perf_event *event = file->private_data; struct perf_mmap_data *data; unsigned int events = POLL_HUP; rcu_read_lock(); - data = rcu_dereference(counter->data); + data = rcu_dereference(event->data); if (data) events = atomic_xchg(&data->poll, 0); rcu_read_unlock(); - poll_wait(file, &counter->waitq, wait); + poll_wait(file, &event->waitq, wait); return events; } -static void perf_counter_reset(struct perf_counter *counter) +static void perf_event_reset(struct perf_event *event) { - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); } /* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) { - struct perf_counter *child; + struct perf_event *child; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) func(child); - mutex_unlock(&counter->child_mutex); + mutex_unlock(&event->child_mutex); } -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) { - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); - counter = counter->group_leader; + event = event->group_leader; - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, list_entry) - perf_counter_for_each_child(counter, func); + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); mutex_unlock(&ctx->mutex); } -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct perf_counter_context *ctx = counter->ctx; + struct perf_event_context *ctx = event->ctx; unsigned long size; int ret = 0; u64 value; - if (!counter->attr.sample_period) + if (!event->attr.sample_period) return -EINVAL; size = copy_from_user(&value, arg, sizeof(value)); @@ -1971,16 +1957,16 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) return -EINVAL; spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { ret = -EINVAL; goto unlock; } - counter->attr.sample_freq = value; + event->attr.sample_freq = value; } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; + event->attr.sample_period = value; + event->hw.sample_period = value; } unlock: spin_unlock_irq(&ctx->lock); @@ -1988,80 +1974,80 @@ unlock: return ret; } -int perf_counter_set_output(struct perf_counter *counter, int output_fd); +int perf_event_set_output(struct perf_event *event, int output_fd); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); u32 flags = arg; switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; break; - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); - case PERF_COUNTER_IOC_SET_OUTPUT: - return perf_counter_set_output(counter, arg); + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); default: return -ENOTTY; } if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); + perf_event_for_each(event, func); else - perf_counter_for_each_child(counter, func); + perf_event_for_each_child(event, func); return 0; } -int perf_counter_task_enable(void) +int perf_event_task_enable(void) { - struct perf_counter *counter; + struct perf_event *event; - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); return 0; } -int perf_counter_task_disable(void) +int perf_event_task_disable(void) { - struct perf_counter *counter; + struct perf_event *event; - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); return 0; } -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 #endif -static int perf_counter_index(struct perf_counter *counter) +static int perf_event_index(struct perf_event *event) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; } /* @@ -2069,13 +2055,13 @@ static int perf_counter_index(struct perf_counter *counter) * the seqlock logic goes bad. We can not serialize this because the arch * code calls this from NMI context. */ -void perf_counter_update_userpage(struct perf_counter *counter) +void perf_event_update_userpage(struct perf_event *event) { - struct perf_counter_mmap_page *userpg; + struct perf_event_mmap_page *userpg; struct perf_mmap_data *data; rcu_read_lock(); - data = rcu_dereference(counter->data); + data = rcu_dereference(event->data); if (!data) goto unlock; @@ -2088,16 +2074,16 @@ void perf_counter_update_userpage(struct perf_counter *counter) preempt_disable(); ++userpg->lock; barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); barrier(); ++userpg->lock; @@ -2106,55 +2092,37 @@ unlock: rcu_read_unlock(); } -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +static unsigned long perf_data_size(struct perf_mmap_data *data) { - struct perf_counter *counter = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; + return data->nr_pages << (PAGE_SHIFT + data->data_order); +} - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; +#ifndef CONFIG_PERF_USE_VMALLOC - vmf->page = virt_to_page(data->data_pages[nr]); - } +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. + */ - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > data->nr_pages) + return NULL; - ret = 0; -unlock: - rcu_read_unlock(); + if (pgoff == 0) + return virt_to_page(data->user_page); - return ret; + return virt_to_page(data->data_pages[pgoff - 1]); } -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { struct perf_mmap_data *data; unsigned long size; int i; - WARN_ON(atomic_read(&counter->mmap_count)); + WARN_ON(atomic_read(&event->mmap_count)); size = sizeof(struct perf_mmap_data); size += nr_pages * sizeof(void *); @@ -2173,19 +2141,10 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) goto fail_data_pages; } + data->data_order = 0; data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - if (counter->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - counter->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - - rcu_assign_pointer(counter->data, data); - return 0; + return data; fail_data_pages: for (i--; i >= 0; i--) @@ -2197,7 +2156,7 @@ fail_user_page: kfree(data); fail: - return -ENOMEM; + return NULL; } static void perf_mmap_free_page(unsigned long addr) @@ -2208,53 +2167,195 @@ static void perf_mmap_free_page(unsigned long addr) __free_page(page); } -static void __perf_mmap_data_free(struct rcu_head *rcu_head) +static void perf_mmap_data_free(struct perf_mmap_data *data) { - struct perf_mmap_data *data; int i; - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - perf_mmap_free_page((unsigned long)data->user_page); for (i = 0; i < data->nr_pages; i++) perf_mmap_free_page((unsigned long)data->data_pages[i]); +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. + * + * Required for architectures that have d-cache aliasing issues. + */ + +static struct page * +perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) +{ + if (pgoff > (1UL << data->data_order)) + return NULL; + + return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + page->mapping = NULL; +} + +static void perf_mmap_data_free_work(struct work_struct *work) +{ + struct perf_mmap_data *data; + void *base; + int i, nr; + + data = container_of(work, struct perf_mmap_data, work); + nr = 1 << data->data_order; + + base = data->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); +} + +static void perf_mmap_data_free(struct perf_mmap_data *data) +{ + schedule_work(&data->work); +} + +static struct perf_mmap_data * +perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + void *all_buf; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + INIT_WORK(&data->work, perf_mmap_data_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + data->user_page = all_buf; + data->data_pages[0] = all_buf + PAGE_SIZE; + data->data_order = ilog2(nr_pages); + data->nr_pages = 1; + + return data; + +fail_all_buf: kfree(data); + +fail: + return NULL; } -static void perf_mmap_data_free(struct perf_counter *counter) +#endif + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct perf_mmap_data *data = counter->data; + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; - WARN_ON(atomic_read(&counter->mmap_count)); + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } - rcu_assign_pointer(counter->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(data, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void +perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) +{ + long max_size = perf_data_size(data); + + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, max_size, + event->attr.wakeup_watermark); + } + + if (!data->watermark) + data->watermark = max_t(long, PAGE_SIZE, max_size / 2); + + + rcu_assign_pointer(event->data, data); +} + +static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + perf_mmap_data_free(data); + kfree(data); +} + +static void perf_mmap_data_release(struct perf_event *event) +{ + struct perf_mmap_data *data = event->data; + + WARN_ON(atomic_read(&event->mmap_count)); + + rcu_assign_pointer(event->data, NULL); + call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } static void perf_mmap_open(struct vm_area_struct *vma) { - struct perf_counter *counter = vma->vm_file->private_data; + struct perf_event *event = vma->vm_file->private_data; - atomic_inc(&counter->mmap_count); + atomic_inc(&event->mmap_count); } static void perf_mmap_close(struct vm_area_struct *vma) { - struct perf_counter *counter = vma->vm_file->private_data; + struct perf_event *event = vma->vm_file->private_data; - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->data); struct user_struct *user = current_user(); - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_release(event); + mutex_unlock(&event->mmap_mutex); } } -static struct vm_operations_struct perf_mmap_vmops = { +static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, .close = perf_mmap_close, .fault = perf_mmap_fault, @@ -2263,10 +2364,11 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { - struct perf_counter *counter = file->private_data; + struct perf_event *event = file->private_data; unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; + struct perf_mmap_data *data; unsigned long vma_size; unsigned long nr_pages; long user_extra, extra; @@ -2291,21 +2393,21 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff != 0) return -EINVAL; - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (counter->output) { + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { ret = -EINVAL; goto unlock; } - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) ret = -EINVAL; goto unlock; } user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* * Increase the limit linearly with more CPUs: @@ -2328,20 +2430,25 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); - if (ret) + WARN_ON(event->data); + + data = perf_mmap_data_alloc(event, nr_pages); + ret = -ENOMEM; + if (!data) goto unlock; - atomic_set(&counter->mmap_count, 1); + ret = 0; + perf_mmap_data_init(event, data); + + atomic_set(&event->mmap_count, 1); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; + event->data->nr_locked = extra; if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; + event->data->writable = 1; unlock: - mutex_unlock(&counter->mmap_mutex); + mutex_unlock(&event->mmap_mutex); vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; @@ -2352,11 +2459,11 @@ unlock: static int perf_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; + struct perf_event *event = filp->private_data; int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); + retval = fasync_helper(fd, filp, on, &event->fasync); mutex_unlock(&inode->i_mutex); if (retval < 0) @@ -2376,19 +2483,19 @@ static const struct file_operations perf_fops = { }; /* - * Perf counter wakeup + * Perf event wakeup * * If there's data, ensure we set the poll() state and publish everything * to user-space before waking everybody up. */ -void perf_counter_wakeup(struct perf_counter *counter) +void perf_event_wakeup(struct perf_event *event) { - wake_up_all(&counter->waitq); + wake_up_all(&event->waitq); - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; } } @@ -2401,19 +2508,19 @@ void perf_counter_wakeup(struct perf_counter *counter) * single linked list and use cmpxchg() to add entries lockless. */ -static void perf_pending_counter(struct perf_pending_entry *entry) +static void perf_pending_event(struct perf_pending_entry *entry) { - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); + struct perf_event *event = container_of(entry, + struct perf_event, pending); - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); } - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); } } @@ -2439,7 +2546,7 @@ static void perf_pending_queue(struct perf_pending_entry *entry, entry->next = *head; } while (cmpxchg(head, entry->next, entry) != entry->next); - set_perf_counter_pending(); + set_perf_event_pending(); put_cpu_var(perf_pending_head); } @@ -2472,7 +2579,7 @@ static int __perf_pending_run(void) return nr; } -static inline int perf_not_pending(struct perf_counter *counter) +static inline int perf_not_pending(struct perf_event *event) { /* * If we flush on whatever cpu we run, there is a chance we don't @@ -2487,15 +2594,15 @@ static inline int perf_not_pending(struct perf_counter *counter) * so that we do not miss the wakeup. -- see perf_pending_handle() */ smp_rmb(); - return counter->pending.next == NULL; + return event->pending.next == NULL; } -static void perf_pending_sync(struct perf_counter *counter) +static void perf_pending_sync(struct perf_event *event) { - wait_event(counter->waitq, perf_not_pending(counter)); + wait_event(event->waitq, perf_not_pending(event)); } -void perf_counter_do_pending(void) +void perf_event_do_pending(void) { __perf_pending_run(); } @@ -2520,7 +2627,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, if (!data->writable) return true; - mask = (data->nr_pages << PAGE_SHIFT) - 1; + mask = perf_data_size(data) - 1; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -2536,25 +2643,25 @@ static void perf_output_wakeup(struct perf_output_handle *handle) atomic_set(&handle->data->poll, POLL_IN); if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); } else - perf_counter_wakeup(handle->counter); + perf_event_wakeup(handle->event); } /* * Curious locking construct. * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * * What we do is serialize between CPUs so we only have to deal with NMI * nesting on a single CPU. * * We only publish the head (and generate a wakeup) when the outer-most - * event completes. + * event_id completes. */ static void perf_output_lock(struct perf_output_handle *handle) { @@ -2625,7 +2732,7 @@ void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { unsigned int pages_mask; - unsigned int offset; + unsigned long offset; unsigned int size; void **pages; @@ -2634,12 +2741,14 @@ void perf_output_copy(struct perf_output_handle *handle, pages = handle->data->data_pages; do { - unsigned int page_offset; + unsigned long page_offset; + unsigned long page_size; int nr; nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); + page_offset = offset & (page_size - 1); + size = min_t(unsigned int, page_size - page_offset, len); memcpy(pages[nr] + page_offset, buf, size); @@ -2658,10 +2767,10 @@ void perf_output_copy(struct perf_output_handle *handle, } int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, + struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_counter *output_counter; + struct perf_event *output_event; struct perf_mmap_data *data; unsigned long tail, offset, head; int have_lost; @@ -2673,21 +2782,21 @@ int perf_output_begin(struct perf_output_handle *handle, rcu_read_lock(); /* - * For inherited counters we send all the output towards the parent. + * For inherited events we send all the output towards the parent. */ - if (counter->parent) - counter = counter->parent; + if (event->parent) + event = event->parent; - output_counter = rcu_dereference(counter->output); - if (output_counter) - counter = output_counter; + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; - data = rcu_dereference(counter->data); + data = rcu_dereference(event->data); if (!data) goto out; handle->data = data; - handle->counter = counter; + handle->event = event; handle->nmi = nmi; handle->sample = sample; @@ -2721,10 +2830,10 @@ int perf_output_begin(struct perf_output_handle *handle, atomic_set(&data->wakeup, 1); if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; + lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; + lost_event.id = event->id; lost_event.lost = atomic_xchg(&data->lost, 0); perf_output_put(handle, lost_event); @@ -2743,10 +2852,10 @@ out: void perf_output_end(struct perf_output_handle *handle) { - struct perf_counter *counter = handle->counter; + struct perf_event *event = handle->event; struct perf_mmap_data *data = handle->data; - int wakeup_events = counter->attr.wakeup_events; + int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { int events = atomic_inc_return(&data->events); @@ -2760,58 +2869,58 @@ void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) { /* - * only top level counters have the pid namespace they were created in + * only top level events have the pid namespace they were created in */ - if (counter->parent) - counter = counter->parent; + if (event->parent) + event = event->parent; - return task_tgid_nr_ns(p, counter->ns); + return task_tgid_nr_ns(p, event->ns); } -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) { /* - * only top level counters have the pid namespace they were created in + * only top level events have the pid namespace they were created in */ - if (counter->parent) - counter = counter->parent; + if (event->parent) + event = event->parent; - return task_pid_nr_ns(p, counter->ns); + return task_pid_nr_ns(p, event->ns); } static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) + struct perf_event *event) { - u64 read_format = counter->attr.read_format; + u64 read_format = event->attr.read_format; u64 values[4]; int n = 0; - values[n++] = atomic64_read(&counter->count); + values[n++] = atomic64_read(&event->count); if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); } if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); } if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + values[n++] = primary_event_id(event); perf_output_copy(handle, values, n * sizeof(u64)); } /* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. */ static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) + struct perf_event *event) { - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; u64 values[5]; int n = 0; @@ -2823,42 +2932,42 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = leader->total_time_running; - if (leader != counter) + if (leader != event) leader->pmu->read(leader); values[n++] = atomic64_read(&leader->count); if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); + values[n++] = primary_event_id(leader); perf_output_copy(handle, values, n * sizeof(u64)); - list_for_each_entry(sub, &leader->sibling_list, list_entry) { + list_for_each_entry(sub, &leader->sibling_list, group_entry) { n = 0; - if (sub != counter) + if (sub != event) sub->pmu->read(sub); values[n++] = atomic64_read(&sub->count); if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); + values[n++] = primary_event_id(sub); perf_output_copy(handle, values, n * sizeof(u64)); } } static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) + struct perf_event *event) { - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); else - perf_output_read_one(handle, counter); + perf_output_read_one(handle, event); } void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, - struct perf_counter *counter) + struct perf_event *event) { u64 sample_type = data->type; @@ -2889,7 +2998,7 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, data->period); if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, counter); + perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (data->callchain) { @@ -2927,14 +3036,14 @@ void perf_output_sample(struct perf_output_handle *handle, void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, - struct perf_counter *counter, + struct perf_event *event, struct pt_regs *regs) { - u64 sample_type = counter->attr.sample_type; + u64 sample_type = event->attr.sample_type; data->type = sample_type; - header->type = PERF_EVENT_SAMPLE; + header->type = PERF_RECORD_SAMPLE; header->size = sizeof(*header); header->misc = 0; @@ -2948,8 +3057,8 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ - data->tid_entry.pid = perf_counter_pid(counter, current); - data->tid_entry.tid = perf_counter_tid(counter, current); + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); header->size += sizeof(data->tid_entry); } @@ -2964,13 +3073,13 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += sizeof(data->addr); if (sample_type & PERF_SAMPLE_ID) { - data->id = primary_counter_id(counter); + data->id = primary_event_id(event); header->size += sizeof(data->id); } if (sample_type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = counter->id; + data->stream_id = event->id; header->size += sizeof(data->stream_id); } @@ -2986,7 +3095,7 @@ void perf_prepare_sample(struct perf_event_header *header, header->size += sizeof(data->period); if (sample_type & PERF_SAMPLE_READ) - header->size += perf_counter_read_size(counter); + header->size += perf_event_read_size(event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { int size = 1; @@ -3012,25 +3121,25 @@ void perf_prepare_sample(struct perf_event_header *header, } } -static void perf_counter_output(struct perf_counter *counter, int nmi, +static void perf_event_output(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_output_handle handle; struct perf_event_header header; - perf_prepare_sample(&header, data, counter, regs); + perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, counter, header.size, nmi, 1)) + if (perf_output_begin(&handle, event, header.size, nmi, 1)) return; - perf_output_sample(&handle, &header, data, counter); + perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); } /* - * read event + * read event_id */ struct perf_read_event { @@ -3041,27 +3150,27 @@ struct perf_read_event { }; static void -perf_counter_read_event(struct perf_counter *counter, +perf_event_read_event(struct perf_event *event, struct task_struct *task) { struct perf_output_handle handle; - struct perf_read_event event = { + struct perf_read_event read_event = { .header = { - .type = PERF_EVENT_READ, + .type = PERF_RECORD_READ, .misc = 0, - .size = sizeof(event) + perf_counter_read_size(counter), + .size = sizeof(read_event) + perf_event_read_size(event), }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), }; int ret; - ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); if (ret) return; - perf_output_put(&handle, event); - perf_output_read(&handle, counter); + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); perf_output_end(&handle); } @@ -3074,7 +3183,7 @@ perf_counter_read_event(struct perf_counter *counter, struct perf_task_event { struct task_struct *task; - struct perf_counter_context *task_ctx; + struct perf_event_context *task_ctx; struct { struct perf_event_header header; @@ -3084,10 +3193,10 @@ struct perf_task_event { u32 tid; u32 ptid; u64 time; - } event; + } event_id; }; -static void perf_counter_task_output(struct perf_counter *counter, +static void perf_event_task_output(struct perf_event *event, struct perf_task_event *task_event) { struct perf_output_handle handle; @@ -3095,85 +3204,85 @@ static void perf_counter_task_output(struct perf_counter *counter, struct task_struct *task = task_event->task; int ret; - size = task_event->event.header.size; - ret = perf_output_begin(&handle, counter, size, 0, 0); + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); if (ret) return; - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event.time = perf_clock(); + task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event); + perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); } -static int perf_counter_task_match(struct perf_counter *counter) +static int perf_event_task_match(struct perf_event *event) { - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) + if (event->attr.comm || event->attr.mmap || event->attr.task) return 1; return 0; } -static void perf_counter_task_ctx(struct perf_counter_context *ctx, +static void perf_event_task_ctx(struct perf_event_context *ctx, struct perf_task_event *task_event) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); } rcu_read_unlock(); } -static void perf_counter_task_event(struct perf_task_event *task_event) +static void perf_event_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; + struct perf_event_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); + perf_event_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); + ctx = rcu_dereference(task_event->task->perf_event_ctxp); if (ctx) - perf_counter_task_ctx(ctx, task_event); + perf_event_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, int new) { struct perf_task_event task_event; - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) return; task_event = (struct perf_task_event){ .task = task, .task_ctx = task_ctx, - .event = { + .event_id = { .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, .misc = 0, - .size = sizeof(task_event.event), + .size = sizeof(task_event.event_id), }, /* .pid */ /* .ppid */ @@ -3182,12 +3291,12 @@ static void perf_counter_task(struct task_struct *task, }, }; - perf_counter_task_event(&task_event); + perf_event_task_event(&task_event); } -void perf_counter_fork(struct task_struct *task) +void perf_event_fork(struct task_struct *task) { - perf_counter_task(task, NULL, 1); + perf_event_task(task, NULL, 1); } /* @@ -3204,56 +3313,56 @@ struct perf_comm_event { u32 pid; u32 tid; - } event; + } event_id; }; -static void perf_counter_comm_output(struct perf_counter *counter, +static void perf_event_comm_output(struct perf_event *event, struct perf_comm_event *comm_event) { struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); if (ret) return; - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); - perf_output_put(&handle, comm_event->event); + perf_output_put(&handle, comm_event->event_id); perf_output_copy(&handle, comm_event->comm, comm_event->comm_size); perf_output_end(&handle); } -static int perf_counter_comm_match(struct perf_counter *counter) +static int perf_event_comm_match(struct perf_event *event) { - if (counter->attr.comm) + if (event->attr.comm) return 1; return 0; } -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, +static void perf_event_comm_ctx(struct perf_event_context *ctx, struct perf_comm_event *comm_event) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); } rcu_read_unlock(); } -static void perf_counter_comm_event(struct perf_comm_event *comm_event) +static void perf_event_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_event_context *ctx; unsigned int size; char comm[TASK_COMM_LEN]; @@ -3264,10 +3373,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) comm_event->comm = comm; comm_event->comm_size = size; - comm_event->event.header.size = sizeof(comm_event->event) + size; + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -3275,29 +3384,29 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) * doesn't really matter which of the child contexts the * events ends up in. */ - ctx = rcu_dereference(current->perf_counter_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_counter_comm_ctx(ctx, comm_event); + perf_event_comm_ctx(ctx, comm_event); rcu_read_unlock(); } -void perf_counter_comm(struct task_struct *task) +void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); - if (!atomic_read(&nr_comm_counters)) + if (!atomic_read(&nr_comm_events)) return; comm_event = (struct perf_comm_event){ .task = task, /* .comm */ /* .comm_size */ - .event = { + .event_id = { .header = { - .type = PERF_EVENT_COMM, + .type = PERF_RECORD_COMM, .misc = 0, /* .size */ }, @@ -3306,7 +3415,7 @@ void perf_counter_comm(struct task_struct *task) }, }; - perf_counter_comm_event(&comm_event); + perf_event_comm_event(&comm_event); } /* @@ -3327,57 +3436,57 @@ struct perf_mmap_event { u64 start; u64 len; u64 pgoff; - } event; + } event_id; }; -static void perf_counter_mmap_output(struct perf_counter *counter, +static void perf_event_mmap_output(struct perf_event *event, struct perf_mmap_event *mmap_event) { struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); if (ret) return; - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); - perf_output_put(&handle, mmap_event->event); + perf_output_put(&handle, mmap_event->event_id); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); perf_output_end(&handle); } -static int perf_counter_mmap_match(struct perf_counter *counter, +static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { - if (counter->attr.mmap) + if (event->attr.mmap) return 1; return 0; } -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, +static void perf_event_mmap_ctx(struct perf_event_context *ctx, struct perf_mmap_event *mmap_event) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); } rcu_read_unlock(); } -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_event_context *ctx; struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; unsigned int size; @@ -3425,10 +3534,10 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; - mmap_event->event.header.size = sizeof(mmap_event->event) + size; + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -3436,28 +3545,28 @@ got_name: * doesn't really matter which of the child contexts the * events ends up in. */ - ctx = rcu_dereference(current->perf_counter_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); + perf_event_mmap_ctx(ctx, mmap_event); rcu_read_unlock(); kfree(buf); } -void __perf_counter_mmap(struct vm_area_struct *vma) +void __perf_event_mmap(struct vm_area_struct *vma) { struct perf_mmap_event mmap_event; - if (!atomic_read(&nr_mmap_counters)) + if (!atomic_read(&nr_mmap_events)) return; mmap_event = (struct perf_mmap_event){ .vma = vma, /* .file_name */ /* .file_size */ - .event = { + .event_id = { .header = { - .type = PERF_EVENT_MMAP, + .type = PERF_RECORD_MMAP, .misc = 0, /* .size */ }, @@ -3469,14 +3578,14 @@ void __perf_counter_mmap(struct vm_area_struct *vma) }, }; - perf_counter_mmap_event(&mmap_event); + perf_event_mmap_event(&mmap_event); } /* * IRQ throttle logging */ -static void perf_log_throttle(struct perf_counter *counter, int enable) +static void perf_log_throttle(struct perf_event *event, int enable) { struct perf_output_handle handle; int ret; @@ -3488,19 +3597,19 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) u64 stream_id; } throttle_event = { .header = { - .type = PERF_EVENT_THROTTLE, + .type = PERF_RECORD_THROTTLE, .misc = 0, .size = sizeof(throttle_event), }, .time = perf_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, + .id = primary_event_id(event), + .stream_id = event->id, }; if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); if (ret) return; @@ -3509,18 +3618,18 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) } /* - * Generic counter overflow handling, sampling. + * Generic event overflow handling, sampling. */ -static int __perf_counter_overflow(struct perf_counter *counter, int nmi, +static int __perf_event_overflow(struct perf_event *event, int nmi, int throttle, struct perf_sample_data *data, struct pt_regs *regs) { - int events = atomic_read(&counter->event_limit); - struct hw_perf_counter *hwc = &counter->hw; + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; int ret = 0; - throttle = (throttle && counter->pmu->unthrottle != NULL); + throttle = (throttle && event->pmu->unthrottle != NULL); if (!throttle) { hwc->interrupts++; @@ -3528,73 +3637,73 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, if (hwc->interrupts != MAX_INTERRUPTS) { hwc->interrupts++; if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { + (u64)sysctl_perf_event_sample_rate) { hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); + perf_log_throttle(event, 0); ret = 1; } } else { /* - * Keep re-disabling counters even though on the previous + * Keep re-disabling events even though on the previous * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: + * sched-in and the event got enabled again: */ ret = 1; } } - if (counter->attr.freq) { + if (event->attr.freq) { u64 now = perf_clock(); s64 delta = now - hwc->freq_stamp; hwc->freq_stamp = now; if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); } /* * XXX event_limit might not quite work as expected on inherited - * counters + * events */ - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { ret = 1; - counter->pending_kill = POLL_HUP; + event->pending_kill = POLL_HUP; if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); } else - perf_counter_disable(counter); + perf_event_disable(event); } - perf_counter_output(counter, nmi, data, regs); + perf_event_output(event, nmi, data, regs); return ret; } -int perf_counter_overflow(struct perf_counter *counter, int nmi, +int perf_event_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - return __perf_counter_overflow(counter, nmi, 1, data, regs); + return __perf_event_overflow(event, nmi, 1, data, regs); } /* - * Generic software counter infrastructure + * Generic software event infrastructure */ /* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event * is kept in the range [-sample_period, 0] so that we can use the * sign as trigger. */ -static u64 perf_swcounter_set_period(struct perf_counter *counter) +static u64 perf_swevent_set_period(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; u64 period = hwc->last_period; u64 nr, offset; s64 old, val; @@ -3615,22 +3724,22 @@ again: return nr; } -static void perf_swcounter_overflow(struct perf_counter *counter, +static void perf_swevent_overflow(struct perf_event *event, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int throttle = 0; u64 overflow; - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); if (hwc->interrupts == MAX_INTERRUPTS) return; for (; overflow; overflow--) { - if (__perf_counter_overflow(counter, nmi, throttle, + if (__perf_event_overflow(event, nmi, throttle, data, regs)) { /* * We inhibit the overflow from happening when @@ -3642,20 +3751,20 @@ static void perf_swcounter_overflow(struct perf_counter *counter, } } -static void perf_swcounter_unthrottle(struct perf_counter *counter) +static void perf_swevent_unthrottle(struct perf_event *event) { /* * Nothing to do, we already reset hwc->interrupts. */ } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, +static void perf_swevent_add(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; - atomic64_add(nr, &counter->count); + atomic64_add(nr, &event->count); if (!hwc->sample_period) return; @@ -3664,29 +3773,29 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, return; if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data, regs); + perf_swevent_overflow(event, nmi, data, regs); } -static int perf_swcounter_is_counting(struct perf_counter *counter) +static int perf_swevent_is_counting(struct perf_event *event) { /* - * The counter is active, we're good! + * The event is active, we're good! */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) + if (event->state == PERF_EVENT_STATE_ACTIVE) return 1; /* - * The counter is off/error, not counting. + * The event is off/error, not counting. */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) + if (event->state != PERF_EVENT_STATE_INACTIVE) return 0; /* - * The counter is inactive, if the context is active + * The event is inactive, if the context is active * we're part of a group that didn't make it on the 'pmu', * not counting. */ - if (counter->ctx->is_active) + if (event->ctx->is_active) return 0; /* @@ -3697,49 +3806,49 @@ static int perf_swcounter_is_counting(struct perf_counter *counter) return 1; } -static int perf_swcounter_match(struct perf_counter *counter, +static int perf_swevent_match(struct perf_event *event, enum perf_type_id type, - u32 event, struct pt_regs *regs) + u32 event_id, struct pt_regs *regs) { - if (!perf_swcounter_is_counting(counter)) + if (!perf_swevent_is_counting(event)) return 0; - if (counter->attr.type != type) + if (event->attr.type != type) return 0; - if (counter->attr.config != event) + if (event->attr.config != event_id) return 0; if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) + if (event->attr.exclude_user && user_mode(regs)) return 0; - if (counter->attr.exclude_kernel && !user_mode(regs)) + if (event->attr.exclude_kernel && !user_mode(regs)) return 0; } return 1; } -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, +static void perf_swevent_ctx_event(struct perf_event_context *ctx, enum perf_type_id type, - u32 event, u64 nr, int nmi, + u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_counter *counter; + struct perf_event *event; if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, data, regs); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, regs)) + perf_swevent_add(event, nr, nmi, data, regs); } rcu_read_unlock(); } -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) { if (in_nmi()) return &cpuctx->recursion[3]; @@ -3753,14 +3862,14 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) return &cpuctx->recursion[0]; } -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; if (*recursion) goto out; @@ -3768,16 +3877,16 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, nr, nmi, data, regs); rcu_read_lock(); /* * doesn't really matter which of the child contexts the * events ends up in. */ - ctx = rcu_dereference(current->perf_counter_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs); + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); rcu_read_unlock(); barrier(); @@ -3787,57 +3896,57 @@ out: put_cpu_var(perf_cpu_context); } -void __perf_swcounter_event(u32 event, u64 nr, int nmi, +void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { .addr = addr, }; - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); } -static void perf_swcounter_read(struct perf_counter *counter) +static void perf_swevent_read(struct perf_event *event) { } -static int perf_swcounter_enable(struct perf_counter *counter) +static int perf_swevent_enable(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; if (hwc->sample_period) { hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); + perf_swevent_set_period(event); } return 0; } -static void perf_swcounter_disable(struct perf_counter *counter) +static void perf_swevent_disable(struct perf_event *event) { } static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, }; /* - * hrtimer based swcounter callback + * hrtimer based swevent callback */ -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; struct pt_regs *regs; - struct perf_counter *counter; + struct perf_event *event; u64 period; - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); data.addr = 0; regs = get_irq_regs(); @@ -3845,45 +3954,45 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->attr.exclude_kernel || !regs) && - !counter->attr.exclude_user) + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) regs = task_pt_regs(current); if (regs) { - if (perf_counter_overflow(counter, 0, &data, regs)) + if (perf_event_overflow(event, 0, &data, regs)) ret = HRTIMER_NORESTART; } - period = max_t(u64, 10000, counter->hw.sample_period); + period = max_t(u64, 10000, event->hw.sample_period); hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } /* - * Software counter: cpu wall time clock + * Software event: cpu wall time clock */ -static void cpu_clock_perf_counter_update(struct perf_counter *counter) +static void cpu_clock_perf_event_update(struct perf_event *event) { int cpu = raw_smp_processor_id(); s64 prev; u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); } -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +static int cpu_clock_perf_event_enable(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; + hwc->hrtimer.function = perf_swevent_hrtimer; if (hwc->sample_period) { u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, @@ -3894,48 +4003,48 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) return 0; } -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +static void cpu_clock_perf_event_disable(struct perf_event *event) { - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); } -static void cpu_clock_perf_counter_read(struct perf_counter *counter) +static void cpu_clock_perf_event_read(struct perf_event *event) { - cpu_clock_perf_counter_update(counter); + cpu_clock_perf_event_update(event); } static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, }; /* - * Software counter: task time clock + * Software event: task time clock */ -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +static void task_clock_perf_event_update(struct perf_event *event, u64 now) { u64 prev; s64 delta; - prev = atomic64_xchg(&counter->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &counter->count); + atomic64_add(delta, &event->count); } -static int task_clock_perf_counter_enable(struct perf_counter *counter) +static int task_clock_perf_event_enable(struct perf_event *event) { - struct hw_perf_counter *hwc = &counter->hw; + struct hw_perf_event *hwc = &event->hw; u64 now; - now = counter->ctx->time; + now = event->ctx->time; atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; + hwc->hrtimer.function = perf_swevent_hrtimer; if (hwc->sample_period) { u64 period = max_t(u64, 10000, hwc->sample_period); __hrtimer_start_range_ns(&hwc->hrtimer, @@ -3946,38 +4055,38 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) return 0; } -static void task_clock_perf_counter_disable(struct perf_counter *counter) +static void task_clock_perf_event_disable(struct perf_event *event) { - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); } -static void task_clock_perf_counter_read(struct perf_counter *counter) +static void task_clock_perf_event_read(struct perf_event *event) { u64 time; if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; + update_context_time(event->ctx); + time = event->ctx->time; } else { u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; } - task_clock_perf_counter_update(counter, time); + task_clock_perf_event_update(event, time); } static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { struct perf_raw_record raw = { @@ -3995,78 +4104,78 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, if (!regs) regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data, regs); } -EXPORT_SYMBOL_GPL(perf_tpcounter_event); +EXPORT_SYMBOL_GPL(perf_tp_event); extern int ftrace_profile_enable(int); extern void ftrace_profile_disable(int); -static void tp_perf_counter_destroy(struct perf_counter *counter) +static void tp_perf_event_destroy(struct perf_event *event) { - ftrace_profile_disable(counter->attr.config); + ftrace_profile_disable(event->attr.config); } -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_event_init(struct perf_event *event) { /* * Raw tracepoint data is a severe data leak, only allow root to * have these. */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (ftrace_profile_enable(counter->attr.config)) + if (ftrace_profile_enable(event->attr.config)) return NULL; - counter->destroy = tp_perf_counter_destroy; + event->destroy = tp_perf_event_destroy; return &perf_ops_generic; } #else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } #endif -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; -static void sw_perf_counter_destroy(struct perf_counter *counter) +static void sw_perf_event_destroy(struct perf_event *event) { - u64 event = counter->attr.config; + u64 event_id = event->attr.config; - WARN_ON(counter->parent); + WARN_ON(event->parent); - atomic_dec(&perf_swcounter_enabled[event]); + atomic_dec(&perf_swevent_enabled[event_id]); } -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_event_init(struct perf_event *event) { const struct pmu *pmu = NULL; - u64 event = counter->attr.config; + u64 event_id = event->attr.config; /* - * Software counters (currently) can't in general distinguish + * Software events (currently) can't in general distinguish * between user, kernel and hypervisor events. * However, context switches and cpu migrations are considered * to be kernel events, and page faults are never hypervisor * events. */ - switch (event) { + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; break; case PERF_COUNT_SW_TASK_CLOCK: /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. */ - if (counter->ctx->task) + if (event->ctx->task) pmu = &perf_ops_task_clock; else pmu = &perf_ops_cpu_clock; @@ -4077,9 +4186,9 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_PAGE_FAULTS_MAJ: case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event]); - counter->destroy = sw_perf_counter_destroy; + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; } pmu = &perf_ops_generic; break; @@ -4089,62 +4198,62 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) } /* - * Allocate and initialize a counter structure + * Allocate and initialize a event structure */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, gfp_t gfpflags) { const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; + struct perf_event *event; + struct hw_perf_event *hwc; long err; - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) + event = kzalloc(sizeof(*event), gfpflags); + if (!event) return ERR_PTR(-ENOMEM); /* - * Single counters are their own group leaders, with an + * Single events are their own group leaders, with an * empty sibling list: */ if (!group_leader) - group_leader = counter; + group_leader = event; - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); - INIT_LIST_HEAD(&counter->list_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); - mutex_init(&counter->mmap_mutex); + mutex_init(&event->mmap_mutex); - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; - counter->parent = parent_counter; + event->parent = parent_event; - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); - counter->state = PERF_COUNTER_STATE_INACTIVE; + event->state = PERF_EVENT_STATE_INACTIVE; if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; + event->state = PERF_EVENT_STATE_OFF; pmu = NULL; - hwc = &counter->hw; + hwc = &event->hw; hwc->sample_period = attr->sample_period; if (attr->freq && attr->sample_freq) hwc->sample_period = 1; @@ -4153,7 +4262,7 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited events */ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; @@ -4162,15 +4271,15 @@ perf_counter_alloc(struct perf_counter_attr *attr, case PERF_TYPE_RAW: case PERF_TYPE_HARDWARE: case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); + pmu = hw_perf_event_init(event); break; case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); + pmu = sw_perf_event_init(event); break; case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); + pmu = tp_perf_event_init(event); break; default: @@ -4184,29 +4293,29 @@ done: err = PTR_ERR(pmu); if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); return ERR_PTR(err); } - counter->pmu = pmu; + event->pmu = pmu; - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); } - return counter; + return event; } -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) { u32 size; int ret; @@ -4285,11 +4394,11 @@ err_size: goto out; } -int perf_counter_set_output(struct perf_counter *counter, int output_fd) +int perf_event_set_output(struct perf_event *event, int output_fd) { - struct perf_counter *output_counter = NULL; + struct perf_event *output_event = NULL; struct file *output_file = NULL; - struct perf_counter *old_output; + struct perf_event *old_output; int fput_needed = 0; int ret = -EINVAL; @@ -4303,28 +4412,28 @@ int perf_counter_set_output(struct perf_counter *counter, int output_fd) if (output_file->f_op != &perf_fops) goto out; - output_counter = output_file->private_data; + output_event = output_file->private_data; /* Don't chain output fds */ - if (output_counter->output) + if (output_event->output) goto out; /* Don't set an output fd when we already have an output channel */ - if (counter->data) + if (event->data) goto out; atomic_long_inc(&output_file->f_count); set: - mutex_lock(&counter->mmap_mutex); - old_output = counter->output; - rcu_assign_pointer(counter->output, output_counter); - mutex_unlock(&counter->mmap_mutex); + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); if (old_output) { /* * we need to make sure no existing perf_output_*() - * is still referencing this counter. + * is still referencing this event. */ synchronize_rcu(); fput(old_output->filp); @@ -4337,21 +4446,21 @@ out: } /** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu + * sys_perf_event_open - open a performance event, associate it to a task/cpu * - * @attr_uptr: event type attributes for monitoring/sampling + * @attr_uptr: event_id type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu - * @group_fd: group leader counter fd + * @group_fd: group leader event fd */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; struct file *group_file = NULL; int fput_needed = 0; int fput_needed2 = 0; @@ -4371,7 +4480,7 @@ SYSCALL_DEFINE5(perf_counter_open, } if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) + if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; } @@ -4383,7 +4492,7 @@ SYSCALL_DEFINE5(perf_counter_open, return PTR_ERR(ctx); /* - * Look up the group leader (we will attach this counter to it): + * Look up the group leader (we will attach this event to it): */ group_leader = NULL; if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { @@ -4414,45 +4523,45 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_put_context; } - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, + event = perf_event_alloc(&attr, cpu, ctx, group_leader, NULL, GFP_KERNEL); - err = PTR_ERR(counter); - if (IS_ERR(counter)) + err = PTR_ERR(event); + if (IS_ERR(event)) goto err_put_context; - err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); if (err < 0) goto err_free_put_context; - counter_file = fget_light(err, &fput_needed2); - if (!counter_file) + event_file = fget_light(err, &fput_needed2); + if (!event_file) goto err_free_put_context; if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_counter_set_output(counter, group_fd); + err = perf_event_set_output(event, group_fd); if (err) goto err_fput_free_put_context; } - counter->filp = counter_file; + event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); + perf_install_in_context(ctx, event, cpu); ++ctx->generation; mutex_unlock(&ctx->mutex); - counter->owner = current; + event->owner = current; get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); err_fput_free_put_context: - fput_light(counter_file, fput_needed2); + fput_light(event_file, fput_needed2); err_free_put_context: if (err < 0) - kfree(counter); + kfree(event); err_put_context: if (err < 0) @@ -4464,88 +4573,88 @@ err_put_context: } /* - * inherit a counter from parent task to child task: + * inherit a event from parent task to child task: */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, +static struct perf_event * +inherit_event(struct perf_event *parent_event, struct task_struct *parent, - struct perf_counter_context *parent_ctx, + struct perf_event_context *parent_ctx, struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) + struct perf_event *group_leader, + struct perf_event_context *child_ctx) { - struct perf_counter *child_counter; + struct perf_event *child_event; /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, * which has a filp for sure, which we use as the reference * count: */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; + if (parent_event->parent) + parent_event = parent_event->parent; - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; + if (IS_ERR(child_event)) + return child_event; get_ctx(child_ctx); /* - * Make the child state follow the state of the parent counter, + * Make the child state follow the state of the parent event, * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. + * so we won't race with perf_event_{en, dis}able_family. */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; else - child_counter->state = PERF_COUNTER_STATE_OFF; + child_event->state = PERF_EVENT_STATE_OFF; - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; /* * Link it up in the child's context: */ - add_counter_to_ctx(child_counter, child_ctx); + add_event_to_ctx(child_event, child_ctx); /* * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because + * when the child event exits. This is safe to do because * we are in the parent and we know that the filp still * exists and has a nonzero count: */ - atomic_long_inc(&parent_counter->filp->f_count); + atomic_long_inc(&parent_event->filp->f_count); /* - * Link this into the parent counter's child list + * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); - return child_counter; + return child_event; } -static int inherit_group(struct perf_counter *parent_counter, +static int inherit_group(struct perf_event *parent_event, struct task_struct *parent, - struct perf_counter_context *parent_ctx, + struct perf_event_context *parent_ctx, struct task_struct *child, - struct perf_counter_context *child_ctx) + struct perf_event_context *child_ctx) { - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; - leader = inherit_counter(parent_counter, parent, parent_ctx, + leader = inherit_event(parent_event, parent, parent_ctx, child, NULL, child_ctx); if (IS_ERR(leader)) return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, child, leader, child_ctx); if (IS_ERR(child_ctr)) return PTR_ERR(child_ctr); @@ -4553,74 +4662,74 @@ static int inherit_group(struct perf_counter *parent_counter, return 0; } -static void sync_child_counter(struct perf_counter *child_counter, +static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { - struct perf_counter *parent_counter = child_counter->parent; + struct perf_event *parent_event = child_event->parent; u64 child_val; - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); - child_val = atomic64_read(&child_counter->count); + child_val = atomic64_read(&child_event->count); /* * Add back the child's count to the parent's count: */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); /* - * Remove this counter from the parent's list + * Remove this event from the parent's list */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); /* - * Release the parent counter, if this was the last + * Release the parent event, if this was the last * reference to it. */ - fput(parent_counter->filp); + fput(parent_event->filp); } static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, struct task_struct *child) { - struct perf_counter *parent_counter; + struct perf_event *parent_event; - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); + update_event_times(child_event); + perf_event_remove_from_context(child_event); - parent_counter = child_counter->parent; + parent_event = child_event->parent; /* - * It can happen that parent exits first, and has counters + * It can happen that parent exits first, and has events * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. + * events need to be zapped - but otherwise linger. */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); } } /* - * When a child task exits, feed back counter values to parent counters. + * When a child task exits, feed back event values to parent events. */ -void perf_counter_exit_task(struct task_struct *child) +void perf_event_exit_task(struct task_struct *child) { - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); return; } @@ -4631,37 +4740,37 @@ void perf_counter_exit_task(struct task_struct *child) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); /* * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has + * reading child->perf_event_ctxp, we wait until it has * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; + child->perf_event_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all - * the counters from it. + * the events from it. */ unclone_ctx(child_ctx); spin_unlock_irqrestore(&child_ctx->lock, flags); /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. */ - perf_counter_task(child, child_ctx, 0); + perf_event_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) * perf_release() * mutex_lock(&ctx->mutex) * @@ -4670,16 +4779,16 @@ void perf_counter_exit_task(struct task_struct *child) mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, - list_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); /* - * If the last counter was a group counter, it will have appended all + * If the last event was a group event, it will have appended all * its siblings to the list, but we obtained 'tmp' before that which * will still point to the list head terminating the iteration. */ - if (!list_empty(&child_ctx->counter_list)) + if (!list_empty(&child_ctx->group_list)) goto again; mutex_unlock(&child_ctx->mutex); @@ -4691,33 +4800,33 @@ again: * free an unexposed, unused context as created by inheritance by * init_task below, used by fork() in case of fail. */ -void perf_counter_free_task(struct task_struct *task) +void perf_event_free_task(struct task_struct *task) { - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; if (!ctx) return; mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { - struct perf_counter *parent = counter->parent; + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; if (WARN_ON_ONCE(!parent)) continue; mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); + list_del_init(&event->child_list); mutex_unlock(&parent->child_mutex); fput(parent->filp); - list_del_counter(counter, ctx); - free_counter(counter); + list_del_event(event, ctx); + free_event(event); } - if (!list_empty(&ctx->counter_list)) + if (!list_empty(&ctx->group_list)) goto again; mutex_unlock(&ctx->mutex); @@ -4726,37 +4835,37 @@ again: } /* - * Initialize the perf_counter context in task_struct + * Initialize the perf_event context in task_struct */ -int perf_counter_init_task(struct task_struct *child) +int perf_event_init_task(struct task_struct *child) { - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; struct task_struct *parent = current; int inherited_all = 1; int ret = 0; - child->perf_counter_ctxp = NULL; + child->perf_event_ctxp = NULL; - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_counter_ctxp)) + if (likely(!parent->perf_event_ctxp)) return 0; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. + * events that have been marked for cloning. * First allocate and initialize a context for the child. */ - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!child_ctx) return -ENOMEM; - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; get_task_struct(child); /* @@ -4782,16 +4891,14 @@ int perf_counter_init_task(struct task_struct *child) * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) - continue; + list_for_each_entry(event, &parent_ctx->group_list, group_entry) { - if (!counter->attr.inherit) { + if (!event->attr.inherit) { inherited_all = 0; continue; } - ret = inherit_group(counter, parent, parent_ctx, + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { inherited_all = 0; @@ -4805,7 +4912,7 @@ int perf_counter_init_task(struct task_struct *child) * context, or of whatever the parent is a clone of. * Note that if the parent is a clone, it could get * uncloned at any point, but that doesn't matter - * because the list of counters and the generation + * because the list of events and the generation * count can't have changed since we took the mutex. */ cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); @@ -4826,41 +4933,41 @@ int perf_counter_init_task(struct task_struct *child) return ret; } -static void __cpuinit perf_counter_init_cpu(int cpu) +static void __cpuinit perf_event_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); + __perf_event_init_context(&cpuctx->ctx, NULL); spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); - hw_perf_counter_setup(cpu); + hw_perf_event_setup(cpu); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) +static void __perf_event_exit_cpu(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; - list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) - __perf_counter_remove_from_context(counter); + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); } -static void perf_counter_exit_cpu(int cpu) +static void perf_event_exit_cpu(int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_event_context *ctx = &cpuctx->ctx; mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); mutex_unlock(&ctx->mutex); } #else -static inline void perf_counter_exit_cpu(int cpu) { } +static inline void perf_event_exit_cpu(int cpu) { } #endif static int __cpuinit @@ -4872,17 +4979,17 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); + perf_event_init_cpu(cpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); + hw_perf_event_setup_online(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); + perf_event_exit_cpu(cpu); break; default: @@ -4900,7 +5007,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .priority = 20, }; -void __init perf_counter_init(void) +void __init perf_event_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); @@ -4926,7 +5033,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, err = strict_strtoul(buf, 10, &val); if (err) return err; - if (val > perf_max_counters) + if (val > perf_max_events) return -EINVAL; spin_lock(&perf_resource_lock); @@ -4934,8 +5041,8 @@ perf_set_reserve_percpu(struct sysdev_class *class, for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); cpuctx->max_pertask = mpt; spin_unlock_irq(&cpuctx->ctx.lock); } @@ -4990,12 +5097,12 @@ static struct attribute *perfclass_attrs[] = { static struct attribute_group perfclass_attr_group = { .attrs = perfclass_attrs, - .name = "perf_counters", + .name = "perf_events", }; -static int __init perf_counter_sysfs_init(void) +static int __init perf_event_sysfs_init(void) { return sysfs_create_group(&cpu_sysdev_class.kset.kobj, &perfclass_attr_group); } -device_initcall(perf_counter_sysfs_init); +device_initcall(perf_event_sysfs_init); diff --git a/kernel/pid.c b/kernel/pid.c index 31310b5d3f50..d3f722d20f9c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -40,7 +40,7 @@ #define pid_hashfn(nr, ns) \ hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) static struct hlist_head *pid_hash; -static int pidhash_shift; +static unsigned int pidhash_shift = 4; struct pid init_struct_pid = INIT_STRUCT_PID; int pid_max = PID_MAX_DEFAULT; @@ -499,19 +499,12 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) void __init pidhash_init(void) { int i, pidhash_size; - unsigned long megabytes = nr_kernel_pages >> (20 - PAGE_SHIFT); - pidhash_shift = max(4, fls(megabytes * 4)); - pidhash_shift = min(12, pidhash_shift); + pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, + HASH_EARLY | HASH_SMALL, + &pidhash_shift, NULL, 4096); pidhash_size = 1 << pidhash_shift; - printk("PID hash table entries: %d (order: %d, %Zd bytes)\n", - pidhash_size, pidhash_shift, - pidhash_size * sizeof(struct hlist_head)); - - pid_hash = alloc_bootmem(pidhash_size * sizeof(*(pid_hash))); - if (!pid_hash) - panic("Could not alloc pidhash!\n"); for (i = 0; i < pidhash_size; i++) INIT_HLIST_HEAD(&pid_hash[i]); } diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 821722ae58a7..86b3796b0436 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -118,7 +118,7 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old { if (!(flags & CLONE_NEWPID)) return get_pid_ns(old_ns); - if (flags & CLONE_THREAD) + if (flags & (CLONE_THREAD|CLONE_PARENT)) return ERR_PTR(-EINVAL); return create_pid_namespace(old_ns); } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e33a21cb9407..5c9dc228747b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -8,17 +8,18 @@ #include <linux/math64.h> #include <asm/uaccess.h> #include <linux/kernel_stat.h> +#include <trace/events/timer.h> /* * Called after updating RLIMIT_CPU to set timer expiration if necessary. */ void update_rlimit_cpu(unsigned long rlim_new) { - cputime_t cputime; + cputime_t cputime = secs_to_cputime(rlim_new); + struct signal_struct *const sig = current->signal; - cputime = secs_to_cputime(rlim_new); - if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || - cputime_gt(current->signal->it_prof_expires, cputime)) { + if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || + cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { spin_lock_irq(¤t->sighand->siglock); set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); spin_unlock_irq(¤t->sighand->siglock); @@ -542,6 +543,17 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) now); } +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); +} + +static inline int expires_le(cputime_t expires, cputime_t new_exp) +{ + return !cputime_eq(expires, cputime_zero) && + cputime_le(expires, new_exp); +} /* * Insert the timer on the appropriate list before any timers that * expire later. This must be called with the tasklist_lock held @@ -586,34 +598,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + union cpu_time_count *exp = &nt->expires; + switch (CPUCLOCK_WHICH(timer->it_clock)) { default: BUG(); case CPUCLOCK_PROF: - if (cputime_eq(p->cputime_expires.prof_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.prof_exp, - nt->expires.cpu)) - p->cputime_expires.prof_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.prof_exp, + exp->cpu)) + p->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_VIRT: - if (cputime_eq(p->cputime_expires.virt_exp, - cputime_zero) || - cputime_gt(p->cputime_expires.virt_exp, - nt->expires.cpu)) - p->cputime_expires.virt_exp = - nt->expires.cpu; + if (expires_gt(p->cputime_expires.virt_exp, + exp->cpu)) + p->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_SCHED: if (p->cputime_expires.sched_exp == 0 || - p->cputime_expires.sched_exp > - nt->expires.sched) + p->cputime_expires.sched_exp > exp->sched) p->cputime_expires.sched_exp = - nt->expires.sched; + exp->sched; break; } } else { + struct signal_struct *const sig = p->signal; + union cpu_time_count *exp = &timer->it.cpu.expires; + /* * For a process timer, set the cached expiration time. */ @@ -621,30 +631,23 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) default: BUG(); case CPUCLOCK_VIRT: - if (!cputime_eq(p->signal->it_virt_expires, - cputime_zero) && - cputime_lt(p->signal->it_virt_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_VIRT].expires, + exp->cpu)) break; - p->signal->cputime_expires.virt_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.virt_exp = exp->cpu; break; case CPUCLOCK_PROF: - if (!cputime_eq(p->signal->it_prof_expires, - cputime_zero) && - cputime_lt(p->signal->it_prof_expires, - timer->it.cpu.expires.cpu)) + if (expires_le(sig->it[CPUCLOCK_PROF].expires, + exp->cpu)) break; - i = p->signal->rlim[RLIMIT_CPU].rlim_cur; + i = sig->rlim[RLIMIT_CPU].rlim_cur; if (i != RLIM_INFINITY && - i <= cputime_to_secs(timer->it.cpu.expires.cpu)) + i <= cputime_to_secs(exp->cpu)) break; - p->signal->cputime_expires.prof_exp = - timer->it.cpu.expires.cpu; + sig->cputime_expires.prof_exp = exp->cpu; break; case CPUCLOCK_SCHED: - p->signal->cputime_expires.sched_exp = - timer->it.cpu.expires.sched; + sig->cputime_expires.sched_exp = exp->sched; break; } } @@ -1071,6 +1074,40 @@ static void stop_process_timers(struct task_struct *tsk) spin_unlock_irqrestore(&cputimer->lock, flags); } +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + cputime_one_jiffy); + it->error -= onecputick; + } + } else { + it->expires = cputime_zero; + } + + trace_itimer_expire(signo == SIGPROF ? + ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + /* * Check for any per-thread CPU timers that have fired and move them * off the tsk->*_timers list onto the firing list. Per-thread timers @@ -1090,10 +1127,10 @@ static void check_process_timers(struct task_struct *tsk, * Don't sample the current process CPU clocks if there are no timers. */ if (list_empty(&timers[CPUCLOCK_PROF]) && - cputime_eq(sig->it_prof_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) && sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY && list_empty(&timers[CPUCLOCK_VIRT]) && - cputime_eq(sig->it_virt_expires, cputime_zero) && + cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { stop_process_timers(tsk); return; @@ -1153,38 +1190,11 @@ static void check_process_timers(struct task_struct *tsk, /* * Check for the special case process timers. */ - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - if (cputime_ge(ptime, sig->it_prof_expires)) { - /* ITIMER_PROF fires and reloads. */ - sig->it_prof_expires = sig->it_prof_incr; - if (!cputime_eq(sig->it_prof_expires, cputime_zero)) { - sig->it_prof_expires = cputime_add( - sig->it_prof_expires, ptime); - } - __group_send_sig_info(SIGPROF, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_prof_expires, cputime_zero) && - (cputime_eq(prof_expires, cputime_zero) || - cputime_lt(sig->it_prof_expires, prof_expires))) { - prof_expires = sig->it_prof_expires; - } - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - if (cputime_ge(utime, sig->it_virt_expires)) { - /* ITIMER_VIRTUAL fires and reloads. */ - sig->it_virt_expires = sig->it_virt_incr; - if (!cputime_eq(sig->it_virt_expires, cputime_zero)) { - sig->it_virt_expires = cputime_add( - sig->it_virt_expires, utime); - } - __group_send_sig_info(SIGVTALRM, SEND_SIG_PRIV, tsk); - } - if (!cputime_eq(sig->it_virt_expires, cputime_zero) && - (cputime_eq(virt_expires, cputime_zero) || - cputime_lt(sig->it_virt_expires, virt_expires))) { - virt_expires = sig->it_virt_expires; - } - } + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); cputime_t x; @@ -1457,7 +1467,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, if (!cputime_eq(*oldval, cputime_zero)) { if (cputime_le(*oldval, now.cpu)) { /* Just about to fire. */ - *oldval = jiffies_to_cputime(1); + *oldval = cputime_one_jiffy; } else { *oldval = cputime_sub(*oldval, now.cpu); } @@ -1703,10 +1713,15 @@ static __init int init_posix_cpu_timers(void) .nsleep = thread_cpu_nsleep, .nsleep_restart = thread_cpu_nsleep_restart, }; + struct timespec ts; register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + return 0; } __initcall(init_posix_cpu_timers); diff --git a/kernel/power/process.c b/kernel/power/process.c index da2072d73811..cc2e55373b68 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -9,6 +9,7 @@ #undef DEBUG #include <linux/interrupt.h> +#include <linux/oom.h> #include <linux/suspend.h> #include <linux/module.h> #include <linux/syscalls.h> diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 97955b0e44f4..36cb168e4330 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = alloc_bootmem(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c86d48..b101cdc4df3f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include <linux/module.h> #include <linux/file.h> -#include <linux/utsname.h> #include <linux/delay.h> #include <linux/bitops.h> #include <linux/genhd.h> diff --git a/kernel/printk.c b/kernel/printk.c index 602033acd6c7..f38b07f78a4e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -206,12 +206,11 @@ __setup("log_buf_len=", log_buf_len_setup); #ifdef CONFIG_BOOT_PRINTK_DELAY static unsigned int boot_delay; /* msecs delay after each printk during bootup */ -static unsigned long long printk_delay_msec; /* per msec, based on boot_delay */ +static unsigned long long loops_per_msec; /* based on boot_delay */ static int __init boot_delay_setup(char *str) { unsigned long lpj; - unsigned long long loops_per_msec; lpj = preset_lpj ? preset_lpj : 1000000; /* some guess */ loops_per_msec = (unsigned long long)lpj / 1000 * HZ; @@ -220,10 +219,9 @@ static int __init boot_delay_setup(char *str) if (boot_delay > 10 * 1000) boot_delay = 0; - printk_delay_msec = loops_per_msec; - printk(KERN_DEBUG "boot_delay: %u, preset_lpj: %ld, lpj: %lu, " - "HZ: %d, printk_delay_msec: %llu\n", - boot_delay, preset_lpj, lpj, HZ, printk_delay_msec); + pr_debug("boot_delay: %u, preset_lpj: %ld, lpj: %lu, " + "HZ: %d, loops_per_msec: %llu\n", + boot_delay, preset_lpj, lpj, HZ, loops_per_msec); return 1; } __setup("boot_delay=", boot_delay_setup); @@ -236,7 +234,7 @@ static void boot_delay_msec(void) if (boot_delay == 0 || system_state != SYSTEM_BOOTING) return; - k = (unsigned long long)printk_delay_msec * boot_delay; + k = (unsigned long long)loops_per_msec * boot_delay; timeout = jiffies + msecs_to_jiffies(boot_delay); while (k) { @@ -655,6 +653,20 @@ static int recursion_bug; static int new_text_line = 1; static char printk_buf[1024]; +int printk_delay_msec __read_mostly; + +static inline void printk_delay(void) +{ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + asmlinkage int vprintk(const char *fmt, va_list args) { int printed_len = 0; @@ -664,6 +676,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) char *p; boot_delay_msec(); + printk_delay(); preempt_disable(); /* This stops the holder of console_sem just where we want him */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 307c285af59e..23bd09cd042e 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -266,9 +266,10 @@ static int ignoring_children(struct sighand_struct *sigh) * or self-reaping. Do notification now if it would have happened earlier. * If it should reap itself, return true. * - * If it's our own child, there is no notification to do. - * But if our normal children self-reap, then this child - * was prevented by ptrace and we must reap it now. + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). */ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) { @@ -278,8 +279,10 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) if (!task_detached(p) && thread_group_empty(p)) { if (!same_thread_group(p->real_parent, tracer)) do_notify_parent(p, p->exit_signal); - else if (ignoring_children(tracer->sighand)) + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); p->exit_signal = -1; + } } if (task_detached(p)) { /* Mark it as in the process of being reaped. */ diff --git a/kernel/relay.c b/kernel/relay.c index bc188549788f..760c26209a3c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -60,7 +60,7 @@ static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* * vm_ops for relay file mappings. */ -static struct vm_operations_struct relay_file_mmap_ops = { +static const struct vm_operations_struct relay_file_mmap_ops = { .fault = relay_buf_fault, .close = relay_file_mmap_close, }; diff --git a/kernel/res_counter.c b/kernel/res_counter.c index e1338f074314..bcdabf37c40b 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -19,6 +19,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; counter->parent = parent; } @@ -101,6 +102,8 @@ res_counter_member(struct res_counter *counter, int member) return &counter->limit; case RES_FAILCNT: return &counter->failcnt; + case RES_SOFT_LIMIT: + return &counter->soft_limit; }; BUG(); diff --git a/kernel/resource.c b/kernel/resource.c index 78b087221c15..fb11a58b9594 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -223,13 +223,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if defined(CONFIG_MEMORY_HOTPLUG) && !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* * Finds the lowest memory reosurce exists within [res->start.res->end) - * the caller must specify res->start, res->end, res->flags. + * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. */ -static int find_next_system_ram(struct resource *res) +static int find_next_system_ram(struct resource *res, char *name) { resource_size_t start, end; struct resource *p; @@ -245,6 +245,8 @@ static int find_next_system_ram(struct resource *res) /* system ram is just marked as IORESOURCE_MEM */ if (p->flags != res->flags) continue; + if (name && strcmp(p->name, name)) + continue; if (p->start > end) { p = NULL; break; @@ -262,19 +264,26 @@ static int find_next_system_ram(struct resource *res) res->end = p->end; return 0; } -int -walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, - int (*func)(unsigned long, unsigned long, void *)) + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". + */ +int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, + void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; unsigned long pfn, len; u64 orig_end; int ret = -1; + res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; - while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { + while ((res.start < res.end) && + (find_next_system_ram(&res, "System RAM") >= 0)) { pfn = (unsigned long)(res.start >> PAGE_SHIFT); len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); ret = (*func)(pfn, len, arg); diff --git a/kernel/sched.c b/kernel/sched.c index 830967e18285..76c0e9691fc0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include <linux/completion.h> #include <linux/kernel_stat.h> #include <linux/debug_locks.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/security.h> #include <linux/notifier.h> #include <linux/profile.h> @@ -780,7 +780,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) return single_open(filp, sched_feat_show, NULL); } -static struct file_operations sched_feat_fops = { +static const struct file_operations sched_feat_fops = { .open = sched_feat_open, .write = sched_feat_write, .read = seq_read, @@ -2053,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2515,22 +2515,17 @@ void sched_fork(struct task_struct *p, int clone_flags) __sched_fork(p); /* - * Make sure we do not leak PI boosting priority to the child. - */ - p->prio = current->normal_prio; - - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { p->policy = SCHED_NORMAL; - - if (p->normal_prio < DEFAULT_PRIO) - p->prio = DEFAULT_PRIO; + p->normal_prio = p->static_prio; + } if (PRIO_TO_NICE(p->static_prio) < 0) { p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; set_load_weight(p); } @@ -2541,6 +2536,11 @@ void sched_fork(struct task_struct *p, int clone_flags) p->sched_reset_on_fork = 0; } + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; @@ -2581,8 +2581,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) BUG_ON(p->state != TASK_RUNNING); update_rq_clock(rq); - p->prio = effective_prio(p); - if (!p->sched_class->task_new || !current->se.on_rq) { activate_task(rq, p, 0); } else { @@ -2718,7 +2716,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -2904,6 +2902,19 @@ unsigned long nr_iowait(void) return sum; } +unsigned long nr_iowait_cpu(void) +{ + struct rq *this = this_rq(); + return atomic_read(&this->nr_iowait); +} + +unsigned long this_cpu_load(void) +{ + struct rq *this = this_rq(); + return this->cpu_load[0]; +} + + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; @@ -5079,17 +5090,16 @@ void account_idle_time(cputime_t cputime) */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy = jiffies_to_cputime(1); - cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); if (user_tick) - account_user_time(p, one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, one_jiffy_scaled); else - account_idle_time(one_jiffy); + account_idle_time(cputime_one_jiffy); } /* @@ -5193,7 +5203,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5409,7 +5419,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -7671,7 +7681,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -9528,7 +9538,7 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } @@ -10300,7 +10310,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10311,7 +10321,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); @@ -10365,8 +10375,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) } static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk) +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) @@ -10376,15 +10385,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, if (tsk->sched_class != &fair_sched_class) return -EINVAL; #endif + return 0; +} +static int +cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup) +{ + int retval = cpu_cgroup_can_attach_task(cgrp, tsk); + if (retval) + return retval; + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + retval = cpu_cgroup_can_attach_task(cgrp, c); + if (retval) { + rcu_read_unlock(); + return retval; + } + } + rcu_read_unlock(); + } return 0; } static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk) + struct cgroup *old_cont, struct task_struct *tsk, + bool threadgroup) { sched_move_task(tsk); + if (threadgroup) { + struct task_struct *c; + rcu_read_lock(); + list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { + sched_move_task(c); + } + rcu_read_unlock(); + } } #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ac2e1dc708bd..479ce5682d7c 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -127,7 +127,7 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (cmpxchg(&scd->clock, old_clock, clock) != old_clock) + if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) goto again; return clock; @@ -163,7 +163,7 @@ again: val = remote_clock; } - if (cmpxchg(ptr, old_val, val) != old_val) + if (cmpxchg64(ptr, old_val, val) != old_val) goto again; return val; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ecc637a0d591..4e777b47eeda 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_DEBUG int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; diff --git a/kernel/signal.c b/kernel/signal.c index 64c5deeaca5d..6705320784fd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -705,7 +705,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) if (why) { /* - * The first thread which returns from finish_stop() + * The first thread which returns from do_signal_stop() * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ @@ -971,6 +971,20 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t) return send_signal(sig, info, t, 0); } +int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, + bool group) +{ + unsigned long flags; + int ret = -ESRCH; + + if (lock_task_sighand(p, &flags)) { + ret = send_signal(sig, info, p, group); + unlock_task_sighand(p, &flags); + } + + return ret; +} + /* * Force a signal that the process can't ignore: if necessary * we unblock the signal and change any SIG_IGN to SIG_DFL. @@ -1036,12 +1050,6 @@ void zap_other_threads(struct task_struct *p) } } -int __fatal_signal_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL); -} -EXPORT_SYMBOL(__fatal_signal_pending); - struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) { struct sighand_struct *sighand; @@ -1068,18 +1076,10 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - unsigned long flags; - int ret; + int ret = check_kill_permission(sig, info, p); - ret = check_kill_permission(sig, info, p); - - if (!ret && sig) { - ret = -ESRCH; - if (lock_task_sighand(p, &flags)) { - ret = __group_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); - } - } + if (!ret && sig) + ret = do_send_sig_info(sig, info, p, true); return ret; } @@ -1224,15 +1224,9 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid) * These are for backward compatibility with the rest of the kernel source. */ -/* - * The caller must ensure the task can't exit. - */ int send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret; - unsigned long flags; - /* * Make sure legacy kernel users don't send in bad values * (normal paths check this in check_kill_permission). @@ -1240,10 +1234,7 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p) if (!valid_signal(sig)) return -EINVAL; - spin_lock_irqsave(&p->sighand->siglock, flags); - ret = specific_send_sig_info(sig, info, p); - spin_unlock_irqrestore(&p->sighand->siglock, flags); - return ret; + return do_send_sig_info(sig, info, p, false); } #define __si_special(priv) \ @@ -1383,15 +1374,6 @@ ret: } /* - * Wake up any threads in the parent blocked in wait* syscalls. - */ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) -{ - wake_up_interruptible_sync(&parent->signal->wait_chldexit); -} - -/* * Let a parent know about the death of a child. * For a stopped/continued status change, use do_notify_parent_cldstop instead. * @@ -1673,29 +1655,6 @@ void ptrace_notify(int exit_code) spin_unlock_irq(¤t->sighand->siglock); } -static void -finish_stop(int stop_count) -{ - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, - * report to the parent. When ptraced, every thread reports itself. - */ - if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, CLD_STOPPED); - read_unlock(&tasklist_lock); - } - - do { - schedule(); - } while (try_to_freeze()); - /* - * Now we don't run again until continued. - */ - current->exit_code = 0; -} - /* * This performs the stopping for SIGSTOP and other stop signals. * We have to stop all threads in the thread group. @@ -1705,15 +1664,9 @@ finish_stop(int stop_count) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int stop_count; + int notify; - if (sig->group_stop_count > 0) { - /* - * There is a group stop in progress. We don't need to - * start another one. - */ - stop_count = --sig->group_stop_count; - } else { + if (!sig->group_stop_count) { struct task_struct *t; if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || @@ -1725,7 +1678,7 @@ static int do_signal_stop(int signr) */ sig->group_exit_code = signr; - stop_count = 0; + sig->group_stop_count = 1; for (t = next_thread(current); t != current; t = next_thread(t)) /* * Setting state to TASK_STOPPED for a group @@ -1734,19 +1687,44 @@ static int do_signal_stop(int signr) */ if (!(t->flags & PF_EXITING) && !task_is_stopped_or_traced(t)) { - stop_count++; + sig->group_stop_count++; signal_wake_up(t, 0); } - sig->group_stop_count = stop_count; } + /* + * If there are no other threads in the group, or if there is + * a group stop in progress and we are the last to stop, report + * to the parent. When ptraced, every thread reports itself. + */ + notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; + notify = tracehook_notify_jctl(notify, CLD_STOPPED); + /* + * tracehook_notify_jctl() can drop and reacquire siglock, so + * we keep ->group_stop_count != 0 before the call. If SIGCONT + * or SIGKILL comes in between ->group_stop_count == 0. + */ + if (sig->group_stop_count) { + if (!--sig->group_stop_count) + sig->flags = SIGNAL_STOP_STOPPED; + current->exit_code = sig->group_exit_code; + __set_current_state(TASK_STOPPED); + } + spin_unlock_irq(¤t->sighand->siglock); - if (stop_count == 0) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; - __set_current_state(TASK_STOPPED); + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + do { + schedule(); + } while (try_to_freeze()); + + tracehook_finish_jctl(); + current->exit_code = 0; - spin_unlock_irq(¤t->sighand->siglock); - finish_stop(stop_count); return 1; } @@ -1815,14 +1793,15 @@ relock: int why = (signal->flags & SIGNAL_STOP_CONTINUED) ? CLD_CONTINUED : CLD_STOPPED; signal->flags &= ~SIGNAL_CLD_MASK; - spin_unlock_irq(&sighand->siglock); - if (unlikely(!tracehook_notify_jctl(1, why))) - goto relock; + why = tracehook_notify_jctl(why, CLD_CONTINUED); + spin_unlock_irq(&sighand->siglock); - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); + if (why) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current->group_leader, why); + read_unlock(&tasklist_lock); + } goto relock; } @@ -1987,14 +1966,14 @@ void exit_signals(struct task_struct *tsk) if (unlikely(tsk->signal->group_stop_count) && !--tsk->signal->group_stop_count) { tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = 1; + group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); } out: spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) { + if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, CLD_STOPPED); + do_notify_parent_cldstop(tsk, group_stop); read_unlock(&tasklist_lock); } } @@ -2290,7 +2269,6 @@ static int do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { struct task_struct *p; - unsigned long flags; int error = -ESRCH; rcu_read_lock(); @@ -2300,14 +2278,16 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. - * - * If lock_task_sighand() fails we pretend the task dies - * after receiving the signal. The window is tiny, and the - * signal is private anyway. */ - if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, info, p); - unlock_task_sighand(p, &flags); + if (!error && sig) { + error = do_send_sig_info(sig, info, p, false); + /* + * If lock_task_sighand() failed we pretend the task + * dies after receiving the signal. The window is tiny, + * and the signal is private anyway. + */ + if (unlikely(error == -ESRCH)) + error = 0; } } rcu_read_unlock(); diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 09d7519557d3..0d31135efbf4 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +static int slow_work_max_threads_sysctl(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif @@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { diff --git a/kernel/smp.c b/kernel/smp.c index 8e218500ab14..c9d1c7835c2f 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -29,8 +29,7 @@ enum { struct call_function_data { struct call_single_data csd; - spinlock_t lock; - unsigned int refs; + atomic_t refs; cpumask_var_t cpumask; }; @@ -39,9 +38,7 @@ struct call_single_queue { spinlock_t lock; }; -static DEFINE_PER_CPU(struct call_function_data, cfd_data) = { - .lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock), -}; +static DEFINE_PER_CPU(struct call_function_data, cfd_data); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -196,25 +193,18 @@ void generic_smp_call_function_interrupt(void) list_for_each_entry_rcu(data, &call_function.queue, csd.list) { int refs; - spin_lock(&data->lock); - if (!cpumask_test_cpu(cpu, data->cpumask)) { - spin_unlock(&data->lock); + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) continue; - } - cpumask_clear_cpu(cpu, data->cpumask); - spin_unlock(&data->lock); data->csd.func(data->csd.info); - spin_lock(&data->lock); - WARN_ON(data->refs == 0); - refs = --data->refs; + refs = atomic_dec_return(&data->refs); + WARN_ON(refs < 0); if (!refs) { spin_lock(&call_function.lock); list_del_rcu(&data->csd.list); spin_unlock(&call_function.lock); } - spin_unlock(&data->lock); if (refs) continue; @@ -357,13 +347,6 @@ void __smp_call_function_single(int cpu, struct call_single_data *data, generic_exec_single(cpu, data, wait); } -/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */ - -#ifndef arch_send_call_function_ipi_mask -# define arch_send_call_function_ipi_mask(maskp) \ - arch_send_call_function_ipi(*(maskp)) -#endif - /** * smp_call_function_many(): Run a function on a set of other CPUs. * @mask: The set of cpus to run on (only runs on online subset). @@ -419,23 +402,20 @@ void smp_call_function_many(const struct cpumask *mask, data = &__get_cpu_var(cfd_data); csd_lock(&data->csd); - spin_lock_irqsave(&data->lock, flags); data->csd.func = func; data->csd.info = info; cpumask_and(data->cpumask, mask, cpu_online_mask); cpumask_clear_cpu(this_cpu, data->cpumask); - data->refs = cpumask_weight(data->cpumask); + atomic_set(&data->refs, cpumask_weight(data->cpumask)); - spin_lock(&call_function.lock); + spin_lock_irqsave(&call_function.lock, flags); /* * Place entry at the _HEAD_ of the list, so that any cpu still * observing the entry in generic_smp_call_function_interrupt() * will not miss any other list entries: */ list_add_rcu(&data->csd.list, &call_function.queue); - spin_unlock(&call_function.lock); - - spin_unlock_irqrestore(&data->lock, flags); + spin_unlock_irqrestore(&call_function.lock, flags); /* * Make the list addition visible before sending the ipi. diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c330838..81324d12eb35 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void) EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } /* diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097c76fa..255475d163e0 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include <linux/prctl.h> #include <linux/highuid.h> #include <linux/fs.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/resource.h> #include <linux/kernel.h> #include <linux/kexec.h> @@ -1338,6 +1338,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) unsigned long flags; cputime_t utime, stime; struct task_cputime cputime; + unsigned long maxrss = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; @@ -1346,6 +1347,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) utime = task_utime(current); stime = task_stime(current); accumulate_thread_rusage(p, r); + maxrss = p->signal->maxrss; goto out; } @@ -1363,6 +1365,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt = p->signal->cmaj_flt; r->ru_inblock = p->signal->cinblock; r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; if (who == RUSAGE_CHILDREN) break; @@ -1377,6 +1380,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) r->ru_majflt += p->signal->maj_flt; r->ru_inblock += p->signal->inblock; r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; t = p; do { accumulate_thread_rusage(t, r); @@ -1392,6 +1397,15 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) out: cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ } int getrusage(struct task_struct *p, int who, struct rusage __user *ru) @@ -1511,11 +1525,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; @@ -1528,6 +1542,28 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, current->timer_slack_ns = arg2; error = 0; break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case 0: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; + break; + case 1: + current->flags |= PF_MCE_PROCESS; + if (arg3 != 0) + current->flags |= PF_MCE_EARLY; + else + current->flags &= ~PF_MCE_EARLY; + break; + default: + return -EINVAL; + } + error = 0; + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 68320f6b07b5..e06d0b8d1951 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -49,6 +49,7 @@ cond_syscall(sys_sendmsg); cond_syscall(compat_sys_sendmsg); cond_syscall(sys_recvmsg); cond_syscall(compat_sys_recvmsg); +cond_syscall(compat_sys_recvfrom); cond_syscall(sys_socketcall); cond_syscall(sys_futex); cond_syscall(compat_sys_futex); @@ -177,4 +178,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a631ba684a4..0d949c517412 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,7 +26,6 @@ #include <linux/proc_fs.h> #include <linux/security.h> #include <linux/ctype.h> -#include <linux/utsname.h> #include <linux/kmemcheck.h> #include <linux/smp_lock.h> #include <linux/fs.h> @@ -50,7 +49,7 @@ #include <linux/reboot.h> #include <linux/ftrace.h> #include <linux/slow-work.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -77,6 +76,7 @@ extern int max_threads; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; +extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; extern int pid_max_min, pid_max_max; @@ -106,6 +106,9 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; +#ifdef CONFIG_PRINTK +static int ten_thousand = 10000; +#endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -160,9 +163,9 @@ extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -421,6 +424,14 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", @@ -722,6 +733,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &ten_thousand, + }, #endif { .ctl_name = KERN_NGROUPS_MAX, @@ -964,28 +986,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, @@ -1376,6 +1398,31 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, +#ifdef CONFIG_MEMORY_FAILURE + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt @@ -2204,7 +2251,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2265,7 +2312,6 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2279,10 +2325,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, filp, + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } @@ -2307,7 +2353,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, + int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2414,13 +2460,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, +static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, filp, + return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } @@ -2428,7 +2474,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2438,10 +2483,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, NULL,NULL); } @@ -2449,7 +2494,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2461,7 +2506,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; @@ -2513,7 +2558,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2526,19 +2570,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2643,21 +2686,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); + buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2670,17 +2711,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2695,11 +2735,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } @@ -2775,7 +2814,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2787,10 +2825,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2798,7 +2836,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2810,10 +2847,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2821,7 +2858,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2834,14 +2870,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2850,7 +2886,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + r = __do_proc_dointvec(&tmp, table, write, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2865,50 +2901,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 0b0a6366c9d4..ee266620b06c 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -1,4 +1,4 @@ -obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 09113347d328..5e18c6ab2c6a 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -394,15 +394,11 @@ void clocksource_resume(void) { struct clocksource *cs; - mutex_lock(&clocksource_mutex); - list_for_each_entry(cs, &clocksource_list, list) if (cs->resume) cs->resume(); clocksource_resume_watchdog(); - - mutex_unlock(&clocksource_mutex); } /** diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e0f59a21c061..89aed5933ed4 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle) if (!inidle && !ts->inidle) goto end; + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governers rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + now = tick_nohz_start_idle(ts); /* @@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle) if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) goto end; - ts->inidle = 1; - if (need_resched()) goto end; diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 000000000000..86628e755f38 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei<zhaolei@cn.fujitsu.com> + */ + +#include <linux/time.h> +#include <linux/module.h> + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). + */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* do a mathdiv for long type */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* How many leap years between y1 and y2, y1 must less or equal to y2 */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset offset seconds adding to totalsecs. + * @result pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. */ + long yg = y + math_div(days, 365); + + /* Adjust DAYS and Y to match the guessed year. */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fddd69d16e03..1b5b7aa2fdfd 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp) return single_open(filp, timer_list_show, NULL); } -static struct file_operations timer_list_fops = { +static const struct file_operations timer_list_fops = { .open = timer_list_open, .read = seq_read, .llseek = seq_lseek, diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 4cde8b9c716f..ee5681f8d7ec 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp) return single_open(filp, tstats_show, NULL); } -static struct file_operations tstats_fops = { +static const struct file_operations tstats_fops = { .open = tstats_open, .read = seq_read, .write = tstats_write, diff --git a/kernel/timer.c b/kernel/timer.c index bbb51074680e..5db5a8d26811 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <linux/sched.h> #include <asm/uaccess.h> @@ -46,6 +46,9 @@ #include <asm/timex.h> #include <asm/io.h> +#define CREATE_TRACE_POINTS +#include <trace/events/timer.h> + u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); @@ -521,6 +524,25 @@ static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } #endif +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + static void __init_timer(struct timer_list *timer, const char *name, struct lock_class_key *key) @@ -549,7 +571,7 @@ void init_timer_key(struct timer_list *timer, const char *name, struct lock_class_key *key) { - debug_timer_init(timer); + debug_init(timer); __init_timer(timer, name, key); } EXPORT_SYMBOL(init_timer_key); @@ -568,7 +590,7 @@ static inline void detach_timer(struct timer_list *timer, { struct list_head *entry = &timer->entry; - debug_timer_deactivate(timer); + debug_deactivate(timer); __list_del(entry->prev, entry->next); if (clear_pending) @@ -632,7 +654,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, goto out_unlock; } - debug_timer_activate(timer); + debug_activate(timer, expires); new_base = __get_cpu_var(tvec_bases); @@ -787,7 +809,7 @@ void add_timer_on(struct timer_list *timer, int cpu) BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); timer_set_base(timer, base); - debug_timer_activate(timer); + debug_activate(timer, timer->expires); if (time_before(timer->expires, base->next_timer) && !tbase_get_deferrable(timer->base)) base->next_timer = timer->expires; @@ -1000,7 +1022,9 @@ static inline void __run_timers(struct tvec_base *base) */ lock_map_acquire(&lockdep_map); + trace_timer_expire_entry(timer); fn(data); + trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); @@ -1187,7 +1211,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e71634604400..b416512ad17f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -83,7 +83,7 @@ config RING_BUFFER_ALLOW_SWAP # This allows those options to appear when no other tracer is selected. But the # options do not appear when something else selects it. We need the two options # GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the -# hidding of the automatic options options. +# hidding of the automatic options. config TRACING bool diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3eb159c277c8..d9d6206e0b14 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -856,6 +856,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, } /** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + +/** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for * @rq: io request @@ -922,10 +953,13 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + WARN_ON(ret); } static void blk_unregister_tracepoints(void) { + unregister_trace_block_rq_remap(blk_add_trace_rq_remap); unregister_trace_block_remap(blk_add_trace_remap); unregister_trace_block_split(blk_add_trace_split); unregister_trace_block_unplug_io(blk_add_trace_unplug_io); @@ -1657,6 +1691,11 @@ int blk_trace_init_sysfs(struct device *dev) return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); } +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c71e91bf7372..37ba67e33265 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -225,7 +225,11 @@ static void ftrace_update_pid_func(void) if (ftrace_trace_function == ftrace_stub) return; +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST func = ftrace_trace_function; +#else + func = __ftrace_trace_function; +#endif if (ftrace_pid_trace) { set_ftrace_pid_function(func); @@ -1074,14 +1078,9 @@ static void ftrace_replace_code(int enable) failed = __ftrace_replace_code(rec, enable); if (failed) { rec->flags |= FTRACE_FL_FAILED; - if ((system_state == SYSTEM_BOOTING) || - !core_kernel_text(rec->ip)) { - ftrace_free_rec(rec); - } else { - ftrace_bug(failed, rec->ip); - /* Stop processing */ - return; - } + ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; } } while_for_each_ftrace_rec(); } @@ -1520,7 +1519,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_ftrace_seq_ops = { +static const struct seq_operations show_ftrace_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -1621,8 +1620,10 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable) if (!ret) { struct seq_file *m = file->private_data; m->private = iter; - } else + } else { + trace_parser_put(&iter->parser); kfree(iter); + } } else file->private_data = iter; mutex_unlock(&ftrace_regex_lock); @@ -2202,7 +2203,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, struct trace_parser *parser; ssize_t ret, read; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&ftrace_regex_lock); @@ -2216,7 +2217,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, parser = &iter->parser; read = trace_get_user(parser, ubuf, cnt, ppos); - if (trace_parser_loaded(parser) && + if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { ret = ftrace_process_regex(parser->buffer, parser->idx, enable); @@ -2459,7 +2460,7 @@ static int g_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations ftrace_graph_seq_ops = { +static const struct seq_operations ftrace_graph_seq_ops = { .start = g_start, .next = g_next, .stop = g_stop, @@ -2552,8 +2553,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - size_t read = 0; - ssize_t ret; + ssize_t read, ret; if (!cnt || cnt < 0) return 0; @@ -2562,29 +2562,31 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { ret = -EBUSY; - goto out; + goto out_unlock; } if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; - goto out; + goto out_unlock; } read = trace_get_user(&parser, ubuf, cnt, ppos); - if (trace_parser_loaded((&parser))) { + if (read >= 0 && trace_parser_loaded((&parser))) { parser.buffer[parser.idx] = 0; /* we allow only one expression at a time */ ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, parser.buffer); if (ret) - goto out; + goto out_free; } ret = read; - out: + +out_free: trace_parser_put(&parser); +out_unlock: mutex_unlock(&graph_lock); return ret; @@ -2655,19 +2657,17 @@ static int ftrace_convert_nops(struct module *mod, } #ifdef CONFIG_MODULES -void ftrace_release(void *start, void *end) +void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = (unsigned long)end; - if (ftrace_disabled || !start || start == end) + if (ftrace_disabled) return; mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { + if (within_module_core(rec->ip, mod)) { /* * rec->ip is changed in ftrace_free_rec() * It should not between s and e if record was freed. @@ -2699,9 +2699,7 @@ static int ftrace_module_notify(struct notifier_block *self, mod->num_ftrace_callsites); break; case MODULE_STATE_GOING: - ftrace_release(mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + ftrace_release_mod(mod); break; } @@ -3015,7 +3013,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3025,7 +3023,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 81b1645c8549..a91da69f153a 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void) return 1; } - if (!register_tracer(&kmem_tracer)) { + if (register_tracer(&kmem_tracer) != 0) { pr_warning("Warning: could not register the kmem tracer\n"); return 1; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index a35925d222ba..45068269ebb1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -415,7 +415,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, /* read the non-space input */ while (cnt && !isspace(ch)) { - if (parser->idx < parser->size) + if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { ret = -EINVAL; @@ -1949,7 +1949,7 @@ static int s_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations tracer_seq_ops = { +static const struct seq_operations tracer_seq_ops = { .start = s_start, .next = s_next, .stop = s_stop, @@ -1984,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -2163,7 +2161,7 @@ static int t_show(struct seq_file *m, void *v) return 0; } -static struct seq_operations show_traces_seq_ops = { +static const struct seq_operations show_traces_seq_ops = { .start = t_start, .next = t_next, .stop = t_stop, @@ -4389,7 +4387,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; /* To save memory, keep the ring buffer size to its minimum */ @@ -4400,7 +4398,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 7a7a9fd249a9..4a194f08f88c 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; + struct ring_buffer *buffer; unsigned long flags; int cpu, pc; const char *p; @@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) goto out; pc = preempt_count(); - event = trace_buffer_lock_reserve(tr, TRACE_BRANCH, + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) goto out; @@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - if (!filter_check_discard(call, entry, tr->buffer, event)) - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index dd44b8768867..8d5c171cc998 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -31,7 +31,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) if (atomic_inc_return(&event->profile_count)) return 0; - if (!total_profile_count++) { + if (!total_profile_count) { buf = (char *)alloc_percpu(profile_buf_t); if (!buf) goto fail_buf; @@ -46,14 +46,19 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) } ret = event->profile_enable(); - if (!ret) + if (!ret) { + total_profile_count++; return 0; + } - kfree(trace_profile_buf_nmi); fail_buf_nmi: - kfree(trace_profile_buf); + if (!total_profile_count) { + free_percpu(trace_profile_buf_nmi); + free_percpu(trace_profile_buf); + trace_profile_buf_nmi = NULL; + trace_profile_buf = NULL; + } fail_buf: - total_profile_count--; atomic_dec(&event->profile_count); return ret; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6f03c8a1105e..d128f65778e6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -232,10 +232,9 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; - size_t read = 0; - ssize_t ret; + ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; ret = tracing_update_buffers(); @@ -247,7 +246,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, read = trace_get_user(&parser, ubuf, cnt, ppos); - if (trace_parser_loaded((&parser))) { + if (read >= 0 && trace_parser_loaded((&parser))) { int set = 1; if (*parser.buffer == '!') diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index ca7d7c4d0c2a..69543a905cd5 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -155,7 +155,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) seq_print_ip_sym(seq, it->from, symflags) && trace_seq_printf(seq, "\n")) return TRACE_TYPE_HANDLED; - return TRACE_TYPE_PARTIAL_LINE;; + return TRACE_TYPE_PARTIAL_LINE; } return TRACE_TYPE_UNHANDLED; } @@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to) struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; + struct ring_buffer *buf; struct hw_branch_entry *entry; unsigned long irq1; int cpu; @@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to) if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) goto out; - event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES, + buf = tr->buffer; + event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES, sizeof(*entry), 0, 0); if (!event) goto out; @@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - if (!filter_check_discard(call, entry, tr->buffer, event)) - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, buf, event)) + trace_buffer_unlock_commit(buf, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index f572f44c6e1e..ed17565826b0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -486,16 +486,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) hardirq ? 'h' : softirq ? 's' : '.')) return 0; - if (entry->lock_depth < 0) - ret = trace_seq_putc(s, '.'); + if (entry->preempt_count) + ret = trace_seq_printf(s, "%x", entry->preempt_count); else - ret = trace_seq_printf(s, "%d", entry->lock_depth); + ret = trace_seq_putc(s, '.'); + if (!ret) return 0; - if (entry->preempt_count) - return trace_seq_printf(s, "%x", entry->preempt_count); - return trace_seq_putc(s, '.'); + if (entry->lock_depth < 0) + return trace_seq_putc(s, '.'); + + return trace_seq_printf(s, "%d", entry->lock_depth); } static int @@ -883,7 +885,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", field->prev_pid, @@ -918,7 +920,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) trace_assign_type(field, iter->ent); if (!S) - task_state_char(field->prev_state); + S = task_state_char(field->prev_state); T = task_state_char(field->next_state); SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0f6facb050a1..8504ac71e4e8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a3550cf2597..527e17eae575 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include <trace/events/syscalls.h> #include <linux/kernel.h> #include <linux/ftrace.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/syscall.h> #include "trace_output.h" @@ -166,7 +166,7 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n", SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(unsigned long, ret)); + SYSCALL_FIELD(long, ret)); if (!ret) return 0; @@ -212,7 +212,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0, + ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0, FILTER_OTHER); return ret; @@ -433,7 +433,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); end: local_irq_restore(flags); @@ -532,7 +532,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_tpcounter_event(sys_data->exit_id, 0, 1, rec, size); + perf_tp_event(sys_data->exit_id, 0, 1, rec, size); end: local_irq_restore(flags); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 9489a0a9b1be..cc89be5bc0f8 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -48,7 +48,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; /* * Note about RCU : - * It is used to to delay the free of multiple probes array until a quiescent + * It is used to delay the free of multiple probes array until a quiescent * state is reached. * Tracepoint entries modifications are protected by the tracepoints_mutex. */ diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501688b9..419209893d87 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include <linux/mm.h> -#include <linux/utsname.h> #include <linux/mman.h> #include <linux/notifier.h> #include <linux/reboot.h> diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 92359cc747a7..69eae358a726 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } |