diff options
Diffstat (limited to 'kernel')
40 files changed, 1830 insertions, 979 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 88c92fb44618..5068e2a4e75f 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE config MUTEX_SPIN_ON_OWNER - def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES + def_bool SMP && !DEBUG_MUTEXES diff --git a/kernel/Makefile b/kernel/Makefile index e9cf19155b46..2d64cfcc8b42 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -61,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o obj-$(CONFIG_CPUSETS) += cpuset.o -obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o obj-$(CONFIG_PID_NS) += pid_namespace.o diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b33513a08beb..00d79df03e76 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) /* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 - * otherwise. */ + * otherwise. + * + * If task_creation is true, this is an explicit indication that we are + * filtering a task rule at task creation time. This and tsk == current are + * the only situations where tsk->cred may be accessed without an rcu read lock. + */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, struct audit_names *name, - enum audit_state *state) + enum audit_state *state, + bool task_creation) { - const struct cred *cred = get_task_cred(tsk); + const struct cred *cred; int i, j, need_sid = 1; u32 sid; + cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); + for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; int result = 0; @@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk, break; } - if (!result) { - put_cred(cred); + if (!result) return 0; - } } if (ctx) { @@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_NEVER: *state = AUDIT_DISABLED; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } - put_cred(cred); return 1; } @@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { + if (audit_filter_rules(tsk, &e->rule, NULL, NULL, + &state, true)) { if (state == AUDIT_RECORD_CONTEXT) *key = kstrdup(e->rule.filterkey, GFP_ATOMIC); rcu_read_unlock(); @@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, list_for_each_entry_rcu(e, list, list) { if ((e->rule.mask[word] & bit) == bit && audit_filter_rules(tsk, &e->rule, ctx, NULL, - &state)) { + &state, false)) { rcu_read_unlock(); ctx->current_state = state; return state; @@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx) list_for_each_entry_rcu(e, list, list) { if ((e->rule.mask[word] & bit) == bit && - audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { + audit_filter_rules(tsk, &e->rule, ctx, n, + &state, false)) { rcu_read_unlock(); ctx->current_state = state; return; diff --git a/kernel/capability.c b/kernel/capability.c index 32a80e08ff4b..283c529f8b1c 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -22,12 +22,8 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; -const kernel_cap_t __cap_full_set = CAP_FULL_SET; -const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET; EXPORT_SYMBOL(__cap_empty_set); -EXPORT_SYMBOL(__cap_full_set); -EXPORT_SYMBOL(__cap_init_eff_set); int file_caps_enabled = 1; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 909a35510af5..2731d115d725 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,6 +57,7 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/eventfd.h> #include <linux/poll.h> +#include <linux/flex_array.h> /* used in cgroup_attach_proc */ #include <asm/atomic.h> @@ -1735,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) } EXPORT_SYMBOL_GPL(cgroup_path); +/* + * cgroup_task_migrate - move a task from one cgroup to another. + * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, bool guarantee) +{ + struct css_set *oldcg; + struct css_set *newcg; + + /* + * get old css_set. we need to take task_lock and refcount it, because + * an exiting task can change its css_set to init_css_set and drop its + * old one without taking cgroup_mutex. + */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + + /* locate or allocate a new css_set for this task. */ + if (guarantee) { + /* we know the css_set we want already exists. */ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + read_lock(&css_set_lock); + newcg = find_existing_css_set(oldcg, cgrp, template); + BUG_ON(!newcg); + get_css_set(newcg); + read_unlock(&css_set_lock); + } else { + might_sleep(); + /* find_css_set will give us newcg already referenced. */ + newcg = find_css_set(oldcg, cgrp); + if (!newcg) { + put_css_set(oldcg); + return -ENOMEM; + } + } + put_css_set(oldcg); + + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); + write_unlock(&css_set_lock); + + /* + * We just gained a reference on oldcg by taking it from the task. As + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. + */ + put_css_set(oldcg); + + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + return 0; +} + /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' * @cgrp: the cgroup the task is attaching to @@ -1745,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path); */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { - int retval = 0; + int retval; struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; - struct css_set *cg; - struct css_set *newcg; struct cgroupfs_root *root = cgrp->root; /* Nothing to do if the task is already in that cgroup */ @@ -1759,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { - retval = ss->can_attach(ss, cgrp, tsk, false); + retval = ss->can_attach(ss, cgrp, tsk); if (retval) { /* * Remember on which subsystem the can_attach() @@ -1771,46 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } } + if (ss->can_attach_task) { + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + goto out; + } + } } - task_lock(tsk); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - /* - * Locate or allocate a new css_set for this task, - * based on its final set of cgroups - */ - newcg = find_css_set(cg, cgrp); - put_css_set(cg); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - task_lock(tsk); - if (tsk->flags & PF_EXITING) { - task_unlock(tsk); - put_css_set(newcg); - retval = -ESRCH; + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); + if (retval) goto out; - } - rcu_assign_pointer(tsk->cgroups, newcg); - task_unlock(tsk); - - /* Update the css_set linked lists if we're using them */ - write_lock(&css_set_lock); - if (!list_empty(&tsk->cg_list)) - list_move(&tsk->cg_list, &newcg->tasks); - write_unlock(&css_set_lock); for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + if (ss->attach_task) + ss->attach_task(cgrp, tsk); if (ss->attach) - ss->attach(ss, cgrp, oldcgrp, tsk, false); + ss->attach(ss, cgrp, oldcgrp, tsk); } - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + synchronize_rcu(); - put_css_set(cg); /* * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -1829,7 +1881,7 @@ out: */ break; if (ss->cancel_attach) - ss->cancel_attach(ss, cgrp, tsk, false); + ss->cancel_attach(ss, cgrp, tsk); } } return retval; @@ -1860,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /* - * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex - * held. May take task_lock of task + * cgroup_attach_proc works in two stages, the first of which prefetches all + * new css_sets needed (to make sure we have enough memory before committing + * to the move) and stores them in a list of entries of the following type. + * TODO: possible optimization: use css_set->rcu_head for chaining instead + */ +struct cg_list_entry { + struct css_set *cg; + struct list_head links; +}; + +static bool css_set_check_fetched(struct cgroup *cgrp, + struct task_struct *tsk, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + + read_lock(&css_set_lock); + newcg = find_existing_css_set(cg, cgrp, template); + if (newcg) + get_css_set(newcg); + read_unlock(&css_set_lock); + + /* doesn't exist at all? */ + if (!newcg) + return false; + /* see if it's already in the list */ + list_for_each_entry(cg_entry, newcg_list, links) { + if (cg_entry->cg == newcg) { + put_css_set(newcg); + return true; + } + } + + /* not found */ + put_css_set(newcg); + return false; +} + +/* + * Find the new css_set and store it in the list in preparation for moving the + * given task to the given cgroup. Returns 0 or -ENOMEM. + */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + + /* ensure a new css_set will exist for this thread */ + newcg = find_css_set(cg, cgrp); + if (!newcg) + return -ENOMEM; + /* add it to the list */ + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); + if (!cg_entry) { + put_css_set(newcg); + return -ENOMEM; + } + cg_entry->cg = newcg; + list_add(&cg_entry->links, newcg_list); + return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ + int retval, i, group_size; + struct cgroup_subsys *ss, *failed_ss = NULL; + bool cancel_failed_ss = false; + /* guaranteed to be initialized later, but the compiler needs this */ + struct cgroup *oldcgrp = NULL; + struct css_set *oldcg; + struct cgroupfs_root *root = cgrp->root; + /* threadgroup list cursor and array */ + struct task_struct *tsk; + struct flex_array *group; + /* + * we need to make sure we have css_sets for all the tasks we're + * going to move -before- we actually start moving them, so that in + * case we get an ENOMEM we can bail out before making any changes. + */ + struct list_head newcg_list; + struct cg_list_entry *cg_entry, *temp_nobe; + + /* + * step 0: in order to do expensive, possibly blocking operations for + * every thread, we cannot iterate the thread group list, since it needs + * rcu or tasklist locked. instead, build an array of all threads in the + * group - threadgroup_fork_lock prevents new threads from appearing, + * and if threads exit, this will just be an over-estimate. + */ + group_size = get_nr_threads(leader); + /* flex_array supports very large thread-groups better than kmalloc. */ + group = flex_array_alloc(sizeof(struct task_struct *), group_size, + GFP_KERNEL); + if (!group) + return -ENOMEM; + /* pre-allocate to guarantee space while iterating in rcu read-side. */ + retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + if (retval) + goto out_free_group_list; + + /* prevent changes to the threadgroup list while we take a snapshot. */ + rcu_read_lock(); + if (!thread_group_leader(leader)) { + /* + * a race with de_thread from another thread's exec() may strip + * us of our leadership, making while_each_thread unsafe to use + * on this task. if this happens, there is no choice but to + * throw this task away and try again (from cgroup_procs_write); + * this is "double-double-toil-and-trouble-check locking". + */ + rcu_read_unlock(); + retval = -EAGAIN; + goto out_free_group_list; + } + /* take a reference on each task in the group to go in the array. */ + tsk = leader; + i = 0; + do { + /* as per above, nr_threads may decrease, but not increase. */ + BUG_ON(i >= group_size); + get_task_struct(tsk); + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ + retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + BUG_ON(retval != 0); + i++; + } while_each_thread(leader, tsk); + /* remember the number of threads in the array for later. */ + group_size = i; + rcu_read_unlock(); + + /* + * step 1: check that we can legitimately attach to the cgroup. + */ + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, leader); + if (retval) { + failed_ss = ss; + goto out_cancel_attach; + } + } + /* a callback to be run on every thread in the threadgroup. */ + if (ss->can_attach_task) { + /* run on each task in the threadgroup. */ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + cancel_failed_ss = true; + goto out_cancel_attach; + } + } + } + } + + /* + * step 2: make sure css_sets exist for all threads to be migrated. + * we use find_css_set, which allocates a new one if necessary. + */ + INIT_LIST_HEAD(&newcg_list); + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* nothing to do if this task is already in the cgroup */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* get old css_set pointer */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + /* ignore this task if it's going away */ + task_unlock(tsk); + continue; + } + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + /* see if the new one for us is already in the list? */ + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + /* was already there, nothing to do. */ + put_css_set(oldcg); + } else { + /* we don't already have it. get new one. */ + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + put_css_set(oldcg); + if (retval) + goto out_list_teardown; + } + } + + /* + * step 3: now that we're guaranteed success wrt the css_sets, proceed + * to move all tasks to the new cgroup, calling ss->attach_task for each + * one along the way. there are no failure cases after here, so this is + * the commit point. + */ + for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + } + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* leave current thread as it is if it's already there */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + /* if the thread is PF_EXITING, it can just get skipped. */ + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + BUG_ON(retval != 0 && retval != -ESRCH); + } + /* nothing is sensitive to fork() after this point. */ + + /* + * step 4: do expensive, non-thread-specific subsystem callbacks. + * TODO: if ever a subsystem needs to know the oldcgrp for each task + * being moved, this call will need to be reworked to communicate that. + */ + for_each_subsys(root, ss) { + if (ss->attach) + ss->attach(ss, cgrp, oldcgrp, leader); + } + + /* + * step 5: success! and cleanup + */ + synchronize_rcu(); + cgroup_wakeup_rmdir_waiter(cgrp); + retval = 0; +out_list_teardown: + /* clean up the list of prefetched css_sets. */ + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { + list_del(&cg_entry->links); + put_css_set(cg_entry->cg); + kfree(cg_entry); + } +out_cancel_attach: + /* same deal as in cgroup_attach_task */ + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) { + if (cancel_failed_ss && ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + break; + } + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + } + } + /* clean up the array of referenced threads in the group. */ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + put_task_struct(tsk); + } +out_free_group_list: + flex_array_free(group); + return retval; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. */ -static int attach_task_by_pid(struct cgroup *cgrp, u64 pid) +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) { struct task_struct *tsk; const struct cred *cred = current_cred(), *tcred; int ret; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + if (pid) { rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk || tsk->flags & PF_EXITING) { + if (!tsk) { rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + if (threadgroup) { + /* + * RCU protects this access, since tsk was found in the + * tid map. a race with de_thread may cause group_leader + * to stop being the leader, but cgroup_attach_proc will + * detect it later. + */ + tsk = tsk->group_leader; + } else if (tsk->flags & PF_EXITING) { + /* optimization for the single-task-only case */ + rcu_read_unlock(); + cgroup_unlock(); return -ESRCH; } + /* + * even if we're attaching all tasks in the thread group, we + * only need to check permissions on one of them. + */ tcred = __task_cred(tsk); if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { rcu_read_unlock(); + cgroup_unlock(); return -EACCES; } get_task_struct(tsk); rcu_read_unlock(); } else { - tsk = current; + if (threadgroup) + tsk = current->group_leader; + else + tsk = current; get_task_struct(tsk); } - ret = cgroup_attach_task(cgrp, tsk); + if (threadgroup) { + threadgroup_fork_write_lock(tsk); + ret = cgroup_attach_proc(cgrp, tsk); + threadgroup_fork_write_unlock(tsk); + } else { + ret = cgroup_attach_task(cgrp, tsk); + } put_task_struct(tsk); + cgroup_unlock(); return ret; } static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) { + return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +{ int ret; - if (!cgroup_lock_live_group(cgrp)) - return -ENODEV; - ret = attach_task_by_pid(cgrp, pid); - cgroup_unlock(); + do { + /* + * attach_proc fails with -EAGAIN if threadgroup leadership + * changes in the middle of the operation, in which case we need + * to find the task_struct for the new leader and start over. + */ + ret = attach_task_by_pid(cgrp, tgid, true); + } while (ret == -EAGAIN); return ret; } @@ -3259,9 +3632,9 @@ static struct cftype files[] = { { .name = CGROUP_FILE_GENERIC_PREFIX "procs", .open = cgroup_procs_open, - /* .write_u64 = cgroup_procs_write, TODO */ + .write_u64 = cgroup_procs_write, .release = cgroup_pidlist_release, - .mode = S_IRUGO, + .mode = S_IRUGO | S_IWUSR, }, { .name = "notify_on_release", @@ -4257,122 +4630,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } /** - * cgroup_clone - clone the cgroup the given subsystem is attached to - * @tsk: the task to be moved - * @subsys: the given subsystem - * @nodename: the name for the new cgroup - * - * Duplicate the current cgroup in the hierarchy that the given - * subsystem is attached to, and move this task into the new - * child. - */ -int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys, - char *nodename) -{ - struct dentry *dentry; - int ret = 0; - struct cgroup *parent, *child; - struct inode *inode; - struct css_set *cg; - struct cgroupfs_root *root; - struct cgroup_subsys *ss; - - /* We shouldn't be called by an unregistered subsystem */ - BUG_ON(!subsys->active); - - /* First figure out what hierarchy and cgroup we're dealing - * with, and pin them so we can drop cgroup_mutex */ - mutex_lock(&cgroup_mutex); - again: - root = subsys->root; - if (root == &rootnode) { - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Pin the hierarchy */ - if (!atomic_inc_not_zero(&root->sb->s_active)) { - /* We race with the final deactivate_super() */ - mutex_unlock(&cgroup_mutex); - return 0; - } - - /* Keep the cgroup alive */ - task_lock(tsk); - parent = task_cgroup(tsk, subsys->subsys_id); - cg = tsk->cgroups; - get_css_set(cg); - task_unlock(tsk); - - mutex_unlock(&cgroup_mutex); - - /* Now do the VFS work to create a cgroup */ - inode = parent->dentry->d_inode; - - /* Hold the parent directory mutex across this operation to - * stop anyone else deleting the new cgroup */ - mutex_lock(&inode->i_mutex); - dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); - if (IS_ERR(dentry)) { - printk(KERN_INFO - "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, - PTR_ERR(dentry)); - ret = PTR_ERR(dentry); - goto out_release; - } - - /* Create the cgroup directory, which also creates the cgroup */ - ret = vfs_mkdir(inode, dentry, 0755); - child = __d_cgrp(dentry); - dput(dentry); - if (ret) { - printk(KERN_INFO - "Failed to create cgroup %s: %d\n", nodename, - ret); - goto out_release; - } - - /* The cgroup now exists. Retake cgroup_mutex and check - * that we're still in the same state that we thought we - * were. */ - mutex_lock(&cgroup_mutex); - if ((root != subsys->root) || - (parent != task_cgroup(tsk, subsys->subsys_id))) { - /* Aargh, we raced ... */ - mutex_unlock(&inode->i_mutex); - put_css_set(cg); - - deactivate_super(root->sb); - /* The cgroup is still accessible in the VFS, but - * we're not going to try to rmdir() it at this - * point. */ - printk(KERN_INFO - "Race in cgroup_clone() - leaking cgroup %s\n", - nodename); - goto again; - } - - /* do any required auto-setup */ - for_each_subsys(root, ss) { - if (ss->post_clone) - ss->post_clone(ss, child); - } - - /* All seems fine. Finish by moving the task into the new cgroup */ - ret = cgroup_attach_task(child, tsk); - mutex_unlock(&cgroup_mutex); - - out_release: - mutex_unlock(&inode->i_mutex); - - mutex_lock(&cgroup_mutex); - put_css_set(cg); - mutex_unlock(&cgroup_mutex); - deactivate_super(root->sb); - return ret; -} - -/** * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp * @cgrp: the cgroup in question * @task: the task in question diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e7bebb7c6c38..e691818d7e45 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss, */ static int freezer_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) + struct task_struct *task) { struct freezer *freezer; @@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss, if (freezer->state != CGROUP_THAWED) return -EBUSY; + return 0; +} + +static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ rcu_read_lock(); - if (__cgroup_freezing_or_frozen(task)) { + if (__cgroup_freezing_or_frozen(tsk)) { rcu_read_unlock(); return -EBUSY; } rcu_read_unlock(); - - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (__cgroup_freezing_or_frozen(c)) { - rcu_read_unlock(); - return -EBUSY; - } - } - rcu_read_unlock(); - } - return 0; } @@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = { .populate = freezer_populate, .subsys_id = freezer_subsys_id, .can_attach = freezer_can_attach, + .can_attach_task = freezer_can_attach_task, + .pre_attach = NULL, + .attach_task = NULL, .attach = NULL, .fork = freezer_fork, .exit = NULL, diff --git a/kernel/compat.c b/kernel/compat.c index 38b1d2c1cbe8..fc9eb093acd5 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) return compat_jiffies_to_clock_t(jiffies); } +#ifdef __ARCH_WANT_SYS_SIGPENDING + /* * Assumption: old_sigset_t and compat_old_sigset_t are both * types that can be passed to put_user()/get_user(). @@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) return ret; } +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK + asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, compat_old_sigset_t __user *oset) { @@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set, return ret; } +#endif + asmlinkage long compat_sys_setrlimit(unsigned int resource, struct compat_rlimit __user *rlim) { @@ -890,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, { compat_sigset_t s32; sigset_t s; - int sig; struct timespec t; siginfo_t info; - long ret, timeout = 0; + long ret; if (sigsetsize != sizeof(sigset_t)) return -EINVAL; @@ -901,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) return -EFAULT; sigset_from_compat(&s, &s32); - sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&s); if (uts) { - if (get_compat_timespec (&t, uts)) + if (get_compat_timespec(&t, uts)) return -EFAULT; - if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 - || t.tv_sec < 0) - return -EINVAL; } - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = timespec_to_jiffies(&t) - +(t.tv_sec || t.tv_nsec); - if (timeout) { - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &s); - - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &s, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - } - spin_unlock_irq(¤t->sighand->siglock); + ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); - if (sig) { - ret = sig; - if (uinfo) { - if (copy_siginfo_to_user32(uinfo, &info)) - ret = -EFAULT; - } - }else { - ret = timeout?-EINTR:-EAGAIN; + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; } + return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2bb8c2e98fff..1ceeb049c827 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1367,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } -/* Protected by cgroup_lock */ -static cpumask_var_t cpus_attach; - /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct task_struct *tsk, bool threadgroup) + struct task_struct *tsk) { - int ret; struct cpuset *cs = cgroup_cs(cont); if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) @@ -1391,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (tsk->flags & PF_THREAD_BOUND) return -EINVAL; - ret = security_task_setscheduler(tsk); - if (ret) - return ret; - if (threadgroup) { - struct task_struct *c; - - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - ret = security_task_setscheduler(c); - if (ret) { - rcu_read_unlock(); - return ret; - } - } - rcu_read_unlock(); - } return 0; } -static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, - struct cpuset *cs) +static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) +{ + return security_task_setscheduler(task); +} + +/* + * Protected by cgroup_lock. The nodemasks must be stored globally because + * dynamically allocating them is not allowed in pre_attach, and they must + * persist among pre_attach, attach_task, and attach. + */ +static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_from; +static nodemask_t cpuset_attach_nodemask_to; + +/* Set-up work for before attaching each task. */ +static void cpuset_pre_attach(struct cgroup *cont) +{ + struct cpuset *cs = cgroup_cs(cont); + + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +} + +/* Per-thread attachment work. */ +static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) { int err; + struct cpuset *cs = cgroup_cs(cont); + /* * can_attach beforehand should guarantee that this doesn't fail. * TODO: have a better way to handle failure here @@ -1421,45 +1430,29 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to, err = set_cpus_allowed_ptr(tsk, cpus_attach); WARN_ON_ONCE(err); - cpuset_change_task_nodemask(tsk, to); + cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); cpuset_update_task_spread_flag(cs, tsk); - } static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, - struct cgroup *oldcont, struct task_struct *tsk, - bool threadgroup) + struct cgroup *oldcont, struct task_struct *tsk) { struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); - static nodemask_t to; /* protected by cgroup_mutex */ - if (cs == &top_cpuset) { - cpumask_copy(cpus_attach, cpu_possible_mask); - } else { - guarantee_online_cpus(cs, cpus_attach); - } - guarantee_online_mems(cs, &to); - - /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); - } - rcu_read_unlock(); - } - - /* change mm; only needs to be done once even if threadgroup */ - to = cs->mems_allowed; + /* + * Change mm, possibly for multiple threads in a threadgroup. This is + * expensive and may sleep. + */ + cpuset_attach_nodemask_from = oldcs->mems_allowed; + cpuset_attach_nodemask_to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &oldcs->mems_allowed, &to); + cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + &cpuset_attach_nodemask_to); mmput(mm); } } @@ -1809,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) } /* - * post_clone() is called at the end of cgroup_clone(). - * 'cgroup' was just created automatically as a result of - * a cgroup_clone(), and the current task is about to - * be moved into 'cgroup'. + * post_clone() is called during cgroup_create() when the + * clone_children mount argument was specified. The cgroup + * can not yet have any tasks. * * Currently we refuse to set up the cgroup - thereby * refusing the task to be entered, and as a result refusing @@ -1911,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = { .create = cpuset_create, .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, + .can_attach_task = cpuset_can_attach_task, + .pre_attach = cpuset_pre_attach, + .attach_task = cpuset_attach_task, .attach = cpuset_attach, .populate = cpuset_populate, .post_clone = cpuset_post_clone, diff --git a/kernel/cred.c b/kernel/cred.c index 8093c16b84b1..174fa84eca30 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management - see Documentation/credentials.txt +/* Task credentials management - see Documentation/security/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) @@ -49,10 +49,10 @@ struct cred init_cred = { .magic = CRED_MAGIC, #endif .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_INIT_INH_SET, + .cap_inheritable = CAP_EMPTY_SET, .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_INIT_EFF_SET, - .cap_bset = CAP_INIT_BSET, + .cap_effective = CAP_FULL_SET, + .cap_bset = CAP_FULL_SET, .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, diff --git a/kernel/exit.c b/kernel/exit.c index 8dd874181542..20a406471525 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1377,11 +1377,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) return NULL; } -/* - * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold - * the lock and this task is uninteresting. If we return nonzero, we have - * released the lock and the system call should return. +/** + * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED + * @wo: wait options + * @ptrace: is the wait for ptrace + * @p: task to wait for + * + * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. + * + * CONTEXT: + * read_lock(&tasklist_lock), which is released if return value is + * non-zero. Also, grabs and releases @p->sighand->siglock. + * + * RETURNS: + * 0 if wait condition didn't exist and search for other wait conditions + * should continue. Non-zero return, -errno on failure and @p's pid on + * success, implies that tasklist_lock is released and wait condition + * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, int ptrace, struct task_struct *p) @@ -1397,6 +1409,9 @@ static int wait_task_stopped(struct wait_opts *wo, if (!ptrace && !(wo->wo_flags & WUNTRACED)) return 0; + if (!task_stopped_code(p, ptrace)) + return 0; + exit_code = 0; spin_lock_irq(&p->sighand->siglock); @@ -1538,33 +1553,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, return 0; } - if (likely(!ptrace) && unlikely(task_ptrace(p))) { + /* dead body doesn't have much to contribute */ + if (p->exit_state == EXIT_DEAD) + return 0; + + /* slay zombie? */ + if (p->exit_state == EXIT_ZOMBIE) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the real + * parent when the ptracer detaches. + */ + if (likely(!ptrace) && unlikely(task_ptrace(p))) { + /* it will become visible, clear notask_error */ + wo->notask_error = 0; + return 0; + } + + /* we don't reap group leaders with subthreads */ + if (!delay_group_leader(p)) + return wait_task_zombie(wo, p); + /* - * This child is hidden by ptrace. - * We aren't allowed to see it now, but eventually we will. + * Allow access to stopped/continued state via zombie by + * falling through. Clearing of notask_error is complex. + * + * When !@ptrace: + * + * If WEXITED is set, notask_error should naturally be + * cleared. If not, subset of WSTOPPED|WCONTINUED is set, + * so, if there are live subthreads, there are events to + * wait for. If all subthreads are dead, it's still safe + * to clear - this function will be called again in finite + * amount time once all the subthreads are released and + * will then return without clearing. + * + * When @ptrace: + * + * Stopped state is per-task and thus can't change once the + * target task dies. Only continued and exited can happen. + * Clear notask_error if WCONTINUED | WEXITED. + */ + if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED))) + wo->notask_error = 0; + } else { + /* + * If @p is ptraced by a task in its real parent's group, + * hide group stop/continued state when looking at @p as + * the real parent; otherwise, a single stop can be + * reported twice as group and ptrace stops. + * + * If a ptracer wants to distinguish the two events for its + * own children, it should create a separate process which + * takes the role of real parent. + */ + if (likely(!ptrace) && task_ptrace(p) && + same_thread_group(p->parent, p->real_parent)) + return 0; + + /* + * @p is alive and it's gonna stop, continue or exit, so + * there always is something to wait for. */ wo->notask_error = 0; - return 0; } - if (p->exit_state == EXIT_DEAD) - return 0; - /* - * We don't reap group leaders with subthreads. + * Wait for stopped. Depending on @ptrace, different stopped state + * is used and the two don't interact with each other. */ - if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) - return wait_task_zombie(wo, p); + ret = wait_task_stopped(wo, ptrace, p); + if (ret) + return ret; /* - * It's stopped or running now, so it might - * later continue, exit, or stop again. + * Wait for continued. There's only one continued state and the + * ptracer can consume it which can confuse the real parent. Don't + * use WCONTINUED from ptracer. You don't need or want it. */ - wo->notask_error = 0; - - if (task_stopped_code(p, ptrace)) - return wait_task_stopped(wo, ptrace, p); - return wait_task_continued(wo, p); } diff --git a/kernel/extable.c b/kernel/extable.c index c2d625fcda77..5339705b8241 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -72,6 +72,16 @@ int core_kernel_text(unsigned long addr) return 0; } +/** + * core_kernel_data - tell if addr points to kernel data + * @addr: address to test + * + * Returns true if @addr passed in is from the core kernel data + * section. + * + * Note: On some archs it may return true for core RODATA, and false + * for others. But will always be true for core RW data. + */ int core_kernel_data(unsigned long addr) { if (addr >= (unsigned long)_sdata && diff --git a/kernel/fork.c b/kernel/fork.c index 2b44d82b8237..ca406d916713 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -59,7 +59,6 @@ #include <linux/taskstats_kern.h> #include <linux/random.h> #include <linux/tty.h> -#include <linux/proc_fs.h> #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> @@ -383,15 +382,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) get_file(file); if (tmp->vm_flags & VM_DENYWRITE) atomic_dec(&inode->i_writecount); - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); if (tmp->vm_flags & VM_SHARED) mapping->i_mmap_writable++; - tmp->vm_truncate_count = mpnt->vm_truncate_count; flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_prio_tree_add(tmp, mpnt); flush_dcache_mmap_unlock(mapping); - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* @@ -486,6 +484,20 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm) +{ +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL)) + return -ENOMEM; + + if (oldmm) + cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm)); + else + memset(mm_cpumask(mm), 0, cpumask_size()); +#endif + return 0; +} + static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) { atomic_set(&mm->mm_users, 1); @@ -522,10 +534,20 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - mm = mm_init(mm, current); + if (!mm) + return NULL; + + memset(mm, 0, sizeof(*mm)); + mm = mm_init(mm, current); + if (!mm) + return NULL; + + if (mm_init_cpumask(mm, NULL)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } + return mm; } @@ -537,6 +559,7 @@ struct mm_struct * mm_alloc(void) void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); + free_cpumask_var(mm->cpu_vm_mask_var); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_mm_destroy(mm); @@ -573,6 +596,57 @@ void mmput(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(mmput); +/* + * We added or removed a vma mapping the executable. The vmas are only mapped + * during exec and are not mapped with the mmap system call. + * Callers must hold down_write() on the mm's mmap_sem for these + */ +void added_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas++; +} + +void removed_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas--; + if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + fput(mm->exe_file); + mm->exe_file = NULL; + } + +} + +void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + if (new_exe_file) + get_file(new_exe_file); + if (mm->exe_file) + fput(mm->exe_file); + mm->exe_file = new_exe_file; + mm->num_exe_file_vmas = 0; +} + +struct file *get_mm_exe_file(struct mm_struct *mm) +{ + struct file *exe_file; + + /* We need mmap_sem to protect against races with removal of + * VM_EXECUTABLE vmas */ + down_read(&mm->mmap_sem); + exe_file = mm->exe_file; + if (exe_file) + get_file(exe_file); + up_read(&mm->mmap_sem); + return exe_file; +} + +static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + /* It's safe to write the exe_file pointer without exe_file_lock because + * this is called during fork when the task is not yet in /proc */ + newmm->exe_file = get_mm_exe_file(oldmm); +} + /** * get_task_mm - acquire a reference to the task's mm * @@ -691,6 +765,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk) if (!mm_init(mm, tsk)) goto fail_nomem; + if (mm_init_cpumask(mm, oldmm)) + goto fail_nocpumask; + if (init_new_context(tsk, mm)) goto fail_nocontext; @@ -717,6 +794,9 @@ fail_nomem: return NULL; fail_nocontext: + free_cpumask_var(mm->cpu_vm_mask_var); + +fail_nocpumask: /* * If init_new_context() failed, we cannot use mmput() to free the mm * because it calls destroy_context() @@ -927,6 +1007,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); +#ifdef CONFIG_CGROUPS + init_rwsem(&sig->threadgroup_fork_lock); +#endif + sig->oom_adj = current->signal->oom_adj; sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; @@ -1108,6 +1192,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, monotonic_to_bootbased(&p->real_start_time); p->io_context = NULL; p->audit_context = NULL; + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_lock(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1193,12 +1279,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) p->tgid = current->tgid; - if (current->nsproxy != p->nsproxy) { - retval = ns_cgroup_clone(p, pid); - if (retval) - goto bad_fork_free_pid; - } - p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* * Clear TID on mm_release()? @@ -1312,6 +1392,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); perf_event_fork(p); return p; @@ -1350,6 +1432,8 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: #endif + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); cgroup_exit(p, cgroup_callbacks_done); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index dbbbf7d43080..a9205e32a059 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -64,17 +64,20 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clock_base = { { - .index = CLOCK_REALTIME, - .get_time = &ktime_get_real, + .index = HRTIMER_BASE_MONOTONIC, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, .resolution = KTIME_LOW_RES, }, { - .index = CLOCK_MONOTONIC, - .get_time = &ktime_get, + .index = HRTIMER_BASE_REALTIME, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, .resolution = KTIME_LOW_RES, }, { - .index = CLOCK_BOOTTIME, + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, @@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, struct hrtimer_cpu_base *new_cpu_base; int this_cpu = smp_processor_id(); int cpu = hrtimer_get_target(this_cpu, pinned); - int basenum = hrtimer_clockid_to_base(base->index); + int basenum = base->index; again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); @@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer, return res; } - -/* - * Retrigger next event is called after clock was set - * - * Called with interrupts disabled via on_each_cpu() - */ -static void retrigger_next_event(void *arg) -{ - struct hrtimer_cpu_base *base; - struct timespec realtime_offset, wtm, sleep; - - if (!hrtimer_hres_active()) - return; - - get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm, - &sleep); - set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); - - base = &__get_cpu_var(hrtimer_bases); - - /* Adjust CLOCK_REALTIME offset */ - raw_spin_lock(&base->lock); - base->clock_base[HRTIMER_BASE_REALTIME].offset = - timespec_to_ktime(realtime_offset); - base->clock_base[HRTIMER_BASE_BOOTTIME].offset = - timespec_to_ktime(sleep); - - hrtimer_force_reprogram(base, 0); - raw_spin_unlock(&base->lock); -} - -/* - * Clock realtime was set - * - * Change the offset of the realtime clock vs. the monotonic - * clock. - * - * We might have to reprogram the high resolution timer interrupt. On - * SMP we call the architecture specific code to retrigger _all_ high - * resolution timer interrupts. On UP we just disable interrupts and - * call the high resolution interrupt code. - */ -void clock_was_set(void) -{ - /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 1); -} - -/* - * During resume we might have to reprogram the high resolution timer - * interrupt (on the local CPU): - */ -void hres_timers_resume(void) -{ - WARN_ONCE(!irqs_disabled(), - KERN_INFO "hres_timers_resume() called with IRQs enabled!"); - - retrigger_next_event(NULL); -} - /* * Initialize the high resolution related parts of cpu_base */ @@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, } /* + * Retrigger next event is called after clock was set + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset, xtim, wtm, sleep; + + if (!hrtimer_hres_active()) + return; + + /* Optimized out for !HIGH_RES */ + get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); + + /* Adjust CLOCK_REALTIME offset */ + raw_spin_lock(&base->lock); + base->clock_base[HRTIMER_BASE_REALTIME].offset = + timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); + + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); +} + +/* * Switch to high resolution mode */ static int hrtimer_switch_to_hres(void) { - int cpu = smp_processor_id(); + int i, cpu = smp_processor_id(); struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); unsigned long flags; @@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void) return 0; } base->hres_active = 1; - base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES; - base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES; - base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + base->clock_base[i].resolution = KTIME_HIGH_RES; tick_setup_sched_timer(); @@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, return 0; } static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code. + */ +void clock_was_set(void) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 1); +#endif + timerfd_clock_was_set(); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt (on the local CPU): + */ +void hrtimers_resume(void) +{ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + + retrigger_next_event(NULL); + timerfd_clock_was_set(); +} + static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) { #ifdef CONFIG_TIMER_STATS @@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer, debug_activate(timer); timerqueue_add(&base->active, &timer->node); + base->cpu_base->active_bases |= 1 << base->index; /* * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the @@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer, #endif } timerqueue_del(&base->active, &timer->node); + if (!timerqueue_getnext(&base->active)) + base->cpu_base->active_bases &= ~(1 << base->index); out: timer->state = newstate; } @@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); - struct hrtimer_clock_base *base; ktime_t expires_next, now, entry_time, delta; int i, retries = 0; @@ -1256,12 +1262,15 @@ retry: */ cpu_base->expires_next.tv64 = KTIME_MAX; - base = cpu_base->clock_base; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - ktime_t basenow; + struct hrtimer_clock_base *base; struct timerqueue_node *node; + ktime_t basenow; + + if (!(cpu_base->active_bases & (1 << i))) + continue; + base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1294,7 +1303,6 @@ retry: __run_hrtimer(timer, &basenow); } - base++; } /* @@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) struct timespec __user *rmtp; int ret = 0; - hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, + hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); @@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, restart = ¤t_thread_info()->restart_block; restart->fn = hrtimer_nanosleep_restart; - restart->nanosleep.index = t.timer.base->index; + restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.rmtp = rmtp; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 834899f2500f..4bd4faa6323a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir; #ifdef CONFIG_SMP -static int irq_affinity_proc_show(struct seq_file *m, void *v) +static int show_irq_affinity(int type, struct seq_file *m, void *v) { struct irq_desc *desc = irq_to_desc((long)m->private); const struct cpumask *mask = desc->irq_data.affinity; @@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v) if (irqd_is_setaffinity_pending(&desc->irq_data)) mask = desc->pending_mask; #endif - seq_cpumask(m, mask); + if (type) + seq_cpumask_list(m, mask); + else + seq_cpumask(m, mask); seq_putc(m, '\n'); return 0; } @@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) #endif int no_irq_affinity; -static ssize_t irq_affinity_proc_write(struct file *file, +static int irq_affinity_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(0, m, v); +} + +static int irq_affinity_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(1, m, v); +} + + +static ssize_t write_irq_affinity(int type, struct file *file, const char __user *buffer, size_t count, loff_t *pos) { unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; @@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file, if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) return -ENOMEM; - err = cpumask_parse_user(buffer, count, new_value); + if (type) + err = cpumask_parselist_user(buffer, count, new_value); + else + err = cpumask_parse_user(buffer, count, new_value); if (err) goto free_cpumask; @@ -100,11 +117,28 @@ free_cpumask: return err; } +static ssize_t irq_affinity_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(0, file, buffer, count, pos); +} + +static ssize_t irq_affinity_list_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(1, file, buffer, count, pos); +} + static int irq_affinity_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_proc_show, PDE(inode)->data); } +static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); +} + static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) { return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); @@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = { .release = single_release, }; +static const struct file_operations irq_affinity_list_proc_fops = { + .open = irq_affinity_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = irq_affinity_list_proc_write, +}; + static int default_affinity_show(struct seq_file *m, void *v) { seq_cpumask(m, irq_default_affinity); @@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("affinity_hint", 0400, desc->dir, &irq_affinity_hint_proc_fops, (void *)(long)irq); + /* create /proc/irq/<irq>/smp_affinity_list */ + proc_create_data("smp_affinity_list", 0600, desc->dir, + &irq_affinity_list_proc_fops, (void *)(long)irq); + proc_create_data("node", 0444, desc->dir, &irq_node_proc_fops, (void *)(long)irq); #endif @@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) #ifdef CONFIG_SMP remove_proc_entry("smp_affinity", desc->dir); remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("smp_affinity_list", desc->dir); remove_proc_entry("node", desc->dir); #endif remove_proc_entry("spurious", desc->dir); diff --git a/kernel/kmod.c b/kernel/kmod.c index 5ae0ff38425f..ad6a81c58b44 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -25,6 +25,7 @@ #include <linux/kmod.h> #include <linux/slab.h> #include <linux/completion.h> +#include <linux/cred.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/workqueue.h> @@ -43,6 +44,13 @@ extern int max_threads; static struct workqueue_struct *khelper_wq; +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); + #ifdef CONFIG_MODULES /* @@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module); static int ____call_usermodehelper(void *data) { struct subprocess_info *sub_info = data; + struct cred *new; int retval; spin_lock_irq(¤t->sighand->siglock); @@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data) goto fail; } + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto fail; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + commit_creds(new); + retval = kernel_execve(sub_info->path, (const char *const *)sub_info->argv, (const char *const *)sub_info->envp); @@ -420,6 +442,84 @@ unlock: } EXPORT_SYMBOL(call_usermodehelper_exec); +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write and array of ulongs from userspace. Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; + void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/module.c b/kernel/module.c index 22879725678d..795bdc7f5c3f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2812,7 +2812,7 @@ static struct module *load_module(void __user *umod, } /* This has to be done once we're sure module name is unique. */ - if (!mod->taints) + if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) dynamic_debug_setup(info.debug, info.num_debug); /* Find duplicate symbols */ @@ -2849,7 +2849,7 @@ static struct module *load_module(void __user *umod, module_bug_cleanup(mod); ddebug: - if (!mod->taints) + if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) dynamic_debug_remove(info.debug); unlock: mutex_unlock(&module_mutex); diff --git a/kernel/mutex.c b/kernel/mutex.c index 2c938e2337cd..d607ed5dd441 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock); */ static inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, - unsigned long ip) + struct lockdep_map *nest_lock, unsigned long ip) { struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; preempt_disable(); - mutex_acquire(&lock->dep_map, subclass, 0, ip); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER /* @@ -269,16 +269,25 @@ void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_nested); +void __sched +_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); +} + +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); + int __sched mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); - return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -287,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, _RET_IP_); + subclass, NULL, _RET_IP_); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -393,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); } static noinline int __sched @@ -401,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); } static noinline int __sched @@ -409,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); - return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); } #endif diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c deleted file mode 100644 index 2c98ad94ba0e..000000000000 --- a/kernel/ns_cgroup.c +++ /dev/null @@ -1,118 +0,0 @@ -/* - * ns_cgroup.c - namespace cgroup subsystem - * - * Copyright 2006, 2007 IBM Corp - */ - -#include <linux/module.h> -#include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/proc_fs.h> -#include <linux/slab.h> -#include <linux/nsproxy.h> - -struct ns_cgroup { - struct cgroup_subsys_state css; -}; - -struct cgroup_subsys ns_subsys; - -static inline struct ns_cgroup *cgroup_to_ns( - struct cgroup *cgroup) -{ - return container_of(cgroup_subsys_state(cgroup, ns_subsys_id), - struct ns_cgroup, css); -} - -int ns_cgroup_clone(struct task_struct *task, struct pid *pid) -{ - char name[PROC_NUMBUF]; - - snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid)); - return cgroup_clone(task, &ns_subsys, name); -} - -/* - * Rules: - * 1. you can only enter a cgroup which is a descendant of your current - * cgroup - * 2. you can only place another process into a cgroup if - * a. you have CAP_SYS_ADMIN - * b. your cgroup is an ancestor of task's destination cgroup - * (hence either you are in the same cgroup as task, or in an - * ancestor cgroup thereof) - */ -static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup, - struct task_struct *task, bool threadgroup) -{ - if (current != task) { - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!cgroup_is_descendant(new_cgroup, current)) - return -EPERM; - } - - if (!cgroup_is_descendant(new_cgroup, task)) - return -EPERM; - - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - if (!cgroup_is_descendant(new_cgroup, c)) { - rcu_read_unlock(); - return -EPERM; - } - } - rcu_read_unlock(); - } - - return 0; -} - -/* - * Rules: you can only create a cgroup if - * 1. you are capable(CAP_SYS_ADMIN) - * 2. the target cgroup is a descendant of your own cgroup - */ -static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - if (!capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - if (!cgroup_is_descendant(cgroup, current)) - return ERR_PTR(-EPERM); - if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) { - printk("ns_cgroup can't be created with parent " - "'clone_children' set.\n"); - return ERR_PTR(-EINVAL); - } - - printk_once("ns_cgroup deprecated: consider using the " - "'clone_children' flag without the ns_cgroup.\n"); - - ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL); - if (!ns_cgroup) - return ERR_PTR(-ENOMEM); - return &ns_cgroup->css; -} - -static void ns_destroy(struct cgroup_subsys *ss, - struct cgroup *cgroup) -{ - struct ns_cgroup *ns_cgroup; - - ns_cgroup = cgroup_to_ns(cgroup); - kfree(ns_cgroup); -} - -struct cgroup_subsys ns_subsys = { - .name = "ns", - .can_attach = ns_can_attach, - .create = ns_create, - .destroy = ns_destroy, - .subsys_id = ns_subsys_id, -}; diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index a05d191ffdd9..d6a00f3de15d 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -22,6 +22,9 @@ #include <linux/pid_namespace.h> #include <net/net_namespace.h> #include <linux/ipc_namespace.h> +#include <linux/proc_fs.h> +#include <linux/file.h> +#include <linux/syscalls.h> static struct kmem_cache *nsproxy_cachep; @@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, goto out; } - err = ns_cgroup_clone(current, task_pid(current)); - if (err) - put_nsproxy(*new_nsp); - out: return err; } @@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p) switch_task_namespaces(p, NULL); } +SYSCALL_DEFINE2(setns, int, fd, int, nstype) +{ + const struct proc_ns_operations *ops; + struct task_struct *tsk = current; + struct nsproxy *new_nsproxy; + struct proc_inode *ei; + struct file *file; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + file = proc_ns_fget(fd); + if (IS_ERR(file)) + return PTR_ERR(file); + + err = -EINVAL; + ei = PROC_I(file->f_dentry->d_inode); + ops = ei->ns_ops; + if (nstype && (ops->type != nstype)) + goto out; + + new_nsproxy = create_new_namespaces(0, tsk, tsk->fs); + if (IS_ERR(new_nsproxy)) { + err = PTR_ERR(new_nsproxy); + goto out; + } + + err = ops->install(new_nsproxy, ei->ns); + if (err) { + free_nsproxy(new_nsproxy); + goto out; + } + switch_task_namespaces(tsk, new_nsproxy); +out: + fput(file); + return err; +} + static int __init nsproxy_cache_init(void) { nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index 68df076fec5d..fd8d1e035df9 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -386,7 +386,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, s32 value; unsigned long flags; struct pm_qos_object *o; - struct pm_qos_request_list *pm_qos_req = filp->private_data;; + struct pm_qos_request_list *pm_qos_req = filp->private_data; if (!pm_qos_req) return -EINVAL; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 0791b13df7bf..58f405b581e7 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; - restart_block->nanosleep.index = which_clock; + restart_block->nanosleep.clockid = which_clock; restart_block->nanosleep.rmtp = rmtp; restart_block->nanosleep.expires = timespec_to_ns(rqtp); } @@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->nanosleep.index; + clockid_t which_clock = restart_block->nanosleep.clockid; struct timespec t; struct itimerspec it; int error; diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index e5498d7405c3..4556182527f3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void) return tmr; } +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + #define IT_ID_SET 1 #define IT_ID_NOT_SET 0 static void release_posix_timer(struct k_itimer *tmr, int it_id_set) @@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set) } put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); - kmem_cache_free(posix_timers_cache, tmr); + call_rcu(&tmr->it.rcu, k_itimer_rcu_free); } static struct k_clock *clockid_to_kclock(const clockid_t id) @@ -631,22 +638,18 @@ out: static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; - /* - * Watch out here. We do a irqsave on the idr_lock and pass the - * flags part over to the timer lock. Must not let interrupts in - * while we are moving the lock. - */ - spin_lock_irqsave(&idr_lock, *flags); + + rcu_read_lock(); timr = idr_find(&posix_timers_id, (int)timer_id); if (timr) { - spin_lock(&timr->it_lock); + spin_lock_irqsave(&timr->it_lock, *flags); if (timr->it_signal == current->signal) { - spin_unlock(&idr_lock); + rcu_read_unlock(); return timr; } - spin_unlock(&timr->it_lock); + spin_unlock_irqrestore(&timr->it_lock, *flags); } - spin_unlock_irqrestore(&idr_lock, *flags); + rcu_read_unlock(); return NULL; } @@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, */ long clock_nanosleep_restart(struct restart_block *restart_block) { - clockid_t which_clock = restart_block->nanosleep.index; + clockid_t which_clock = restart_block->nanosleep.clockid; struct k_clock *kc = clockid_to_kclock(which_clock); if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) diff --git a/kernel/printk.c b/kernel/printk.c index da8ca817eae3..35185392173f 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -31,6 +31,7 @@ #include <linux/smp.h> #include <linux/security.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/syscalls.h> #include <linux/kexec.h> #include <linux/kdb.h> @@ -167,46 +168,74 @@ void log_buf_kexec_setup(void) } #endif +/* requested log_buf_len from kernel cmdline */ +static unsigned long __initdata new_log_buf_len; + +/* save requested log_buf_len since it's too early to process it */ static int __init log_buf_len_setup(char *str) { unsigned size = memparse(str, &str); - unsigned long flags; if (size) size = roundup_pow_of_two(size); - if (size > log_buf_len) { - unsigned start, dest_idx, offset; - char *new_log_buf; + if (size > log_buf_len) + new_log_buf_len = size; - new_log_buf = alloc_bootmem(size); - if (!new_log_buf) { - printk(KERN_WARNING "log_buf_len: allocation failed\n"); - goto out; - } + return 0; +} +early_param("log_buf_len", log_buf_len_setup); - spin_lock_irqsave(&logbuf_lock, flags); - log_buf_len = size; - log_buf = new_log_buf; - - offset = start = min(con_start, log_start); - dest_idx = 0; - while (start != log_end) { - log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)]; - start++; - dest_idx++; - } - log_start -= offset; - con_start -= offset; - log_end -= offset; - spin_unlock_irqrestore(&logbuf_lock, flags); +void __init setup_log_buf(int early) +{ + unsigned long flags; + unsigned start, dest_idx, offset; + char *new_log_buf; + int free; + + if (!new_log_buf_len) + return; + + if (early) { + unsigned long mem; - printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len); + mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); + if (mem == MEMBLOCK_ERROR) + return; + new_log_buf = __va(mem); + } else { + new_log_buf = alloc_bootmem_nopanic(new_log_buf_len); } -out: - return 1; -} -__setup("log_buf_len=", log_buf_len_setup); + if (unlikely(!new_log_buf)) { + pr_err("log_buf_len: %ld bytes not available\n", + new_log_buf_len); + return; + } + + spin_lock_irqsave(&logbuf_lock, flags); + log_buf_len = new_log_buf_len; + log_buf = new_log_buf; + new_log_buf_len = 0; + free = __LOG_BUF_LEN - log_end; + + offset = start = min(con_start, log_start); + dest_idx = 0; + while (start != log_end) { + unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1); + + log_buf[dest_idx] = __log_buf[log_idx_mask]; + start++; + dest_idx++; + } + log_start -= offset; + con_start -= offset; + log_end -= offset; + spin_unlock_irqrestore(&logbuf_lock, flags); + + pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("early log buf free: %d(%d%%)\n", + free, (free * 100) / __LOG_BUF_LEN); +} #ifdef CONFIG_BOOT_PRINTK_DELAY diff --git a/kernel/profile.c b/kernel/profile.c index 66f841b7fbd3..961b389fe52f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -126,11 +126,9 @@ int __ref profile_init(void) if (prof_buffer) return 0; - prof_buffer = vmalloc(buffer_bytes); - if (prof_buffer) { - memset(prof_buffer, 0, buffer_bytes); + prof_buffer = vzalloc(buffer_bytes); + if (prof_buffer) return 0; - } free_cpumask_var(prof_cpu_mask); return -ENOMEM; @@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void) mutex_unlock(&profile_flip_mutex); } -void profile_hits(int type, void *__pc, unsigned int nr_hits) +static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long primary, secondary, flags, pc = (unsigned long)__pc; int i, j, cpu; struct profile_hit *hits; - if (prof_on != type || !prof_buffer) - return; pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; @@ -419,16 +415,20 @@ out_free: #define profile_discard_flip_buffers() do { } while (0) #define profile_cpu_callback NULL -void profile_hits(int type, void *__pc, unsigned int nr_hits) +static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) { unsigned long pc; - - if (prof_on != type || !prof_buffer) - return; pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift; atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); } #endif /* !CONFIG_SMP */ + +void profile_hits(int type, void *__pc, unsigned int nr_hits) +{ + if (prof_on != type || !prof_buffer) + return; + do_profile_hits(type, __pc, nr_hits); +} EXPORT_SYMBOL_GPL(profile_hits); void profile_tick(int type) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index dc7ab65f3b36..2df115790cd9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -38,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) child->parent = new_parent; } -/* - * Turn a tracing stop into a normal stop now, since with no tracer there - * would be no way to wake it up with SIGCONT or SIGKILL. If there was a - * signal sent that would resume the child, but didn't because it was in - * TASK_TRACED, resume it now. - * Requires that irqs be disabled. - */ -static void ptrace_untrace(struct task_struct *child) -{ - spin_lock(&child->sighand->siglock); - if (task_is_traced(child)) { - /* - * If the group stop is completed or in progress, - * this thread was already counted as stopped. - */ - if (child->signal->flags & SIGNAL_STOP_STOPPED || - child->signal->group_stop_count) - __set_task_state(child, TASK_STOPPED); - else - signal_wake_up(child, 1); - } - spin_unlock(&child->sighand->siglock); -} - -/* - * unptrace a task: move it back to its original parent and - * remove it from the ptrace list. +/** + * __ptrace_unlink - unlink ptracee and restore its execution state + * @child: ptracee to be unlinked * - * Must be called with the tasklist lock write-held. + * Remove @child from the ptrace list, move it back to the original parent, + * and restore the execution state so that it conforms to the group stop + * state. + * + * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer + * exiting. For PTRACE_DETACH, unless the ptracee has been killed between + * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. + * If the ptracer is exiting, the ptracee can be in any state. + * + * After detach, the ptracee should be in a state which conforms to the + * group stop. If the group is stopped or in the process of stopping, the + * ptracee should be put into TASK_STOPPED; otherwise, it should be woken + * up from TASK_TRACED. + * + * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, + * it goes through TRACED -> RUNNING -> STOPPED transition which is similar + * to but in the opposite direction of what happens while attaching to a + * stopped task. However, in this direction, the intermediate RUNNING + * state is not hidden even from the current ptracer and if it immediately + * re-attaches and performs a WNOHANG wait(2), it may fail. + * + * CONTEXT: + * write_lock_irq(tasklist_lock) */ void __ptrace_unlink(struct task_struct *child) { @@ -76,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child) child->parent = child->real_parent; list_del_init(&child->ptrace_entry); - if (task_is_traced(child)) - ptrace_untrace(child); + spin_lock(&child->sighand->siglock); + + /* + * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * @child isn't dead. + */ + if (!(child->flags & PF_EXITING) && + (child->signal->flags & SIGNAL_STOP_STOPPED || + child->signal->group_stop_count)) + child->group_stop |= GROUP_STOP_PENDING; + + /* + * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick + * @child in the butt. Note that @resume should be used iff @child + * is in TASK_TRACED; otherwise, we might unduly disrupt + * TASK_KILLABLE sleeps. + */ + if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + signal_wake_up(child, task_is_traced(child)); + + spin_unlock(&child->sighand->siglock); } /* @@ -96,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill) */ read_lock(&tasklist_lock); if ((child->ptrace & PT_PTRACED) && child->parent == current) { - ret = 0; /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ spin_lock_irq(&child->sighand->siglock); - if (task_is_stopped(child)) - child->state = TASK_TRACED; - else if (!task_is_traced(child) && !kill) - ret = -ESRCH; + WARN_ON_ONCE(task_is_stopped(child)); + if (task_is_traced(child) || kill) + ret = 0; spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); @@ -169,6 +184,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode) static int ptrace_attach(struct task_struct *task) { + bool wait_trap = false; int retval; audit_ptrace(task); @@ -208,12 +224,42 @@ static int ptrace_attach(struct task_struct *task) __ptrace_link(task, current); send_sig_info(SIGSTOP, SEND_SIG_FORCED, task); + spin_lock(&task->sighand->siglock); + + /* + * If the task is already STOPPED, set GROUP_STOP_PENDING and + * TRAPPING, and kick it so that it transits to TRACED. TRAPPING + * will be cleared if the child completes the transition or any + * event which clears the group stop states happens. We'll wait + * for the transition to complete before returning from this + * function. + * + * This hides STOPPED -> RUNNING -> TRACED transition from the + * attaching thread but a different thread in the same group can + * still observe the transient RUNNING state. IOW, if another + * thread's WNOHANG wait(2) on the stopped tracee races against + * ATTACH, the wait(2) may fail due to the transient RUNNING. + * + * The following task_is_stopped() test is safe as both transitions + * in and out of STOPPED are protected by siglock. + */ + if (task_is_stopped(task)) { + task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING; + signal_wake_up(task, 1); + wait_trap = true; + } + + spin_unlock(&task->sighand->siglock); + retval = 0; unlock_tasklist: write_unlock_irq(&tasklist_lock); unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: + if (wait_trap) + wait_event(current->signal->wait_chldexit, + !(task->group_stop & GROUP_STOP_TRAPPING)); return retval; } @@ -316,8 +362,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) if (child->ptrace) { child->exit_code = data; dead = __ptrace_detach(current, child); - if (!child->exit_state) - wake_up_state(child, TASK_TRACED | TASK_STOPPED); } write_unlock_irq(&tasklist_lock); @@ -518,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request, } child->exit_code = data; - wake_up_process(child); + wake_up_state(child, __TASK_TRACED); return 0; } diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 421abfd3641d..7bbac7d0f5ab 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -35,6 +35,7 @@ #include <linux/init.h> #include <linux/time.h> #include <linux/cpu.h> +#include <linux/prefetch.h> /* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ static struct task_struct *rcu_kthread_task; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index e486f7c3ffb8..f07d2f03181a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -49,6 +49,7 @@ #include <linux/kernel_stat.h> #include <linux/wait.h> #include <linux/kthread.h> +#include <linux/prefetch.h> #include "rcutree.h" diff --git a/kernel/sched.c b/kernel/sched.c index c62acf45d3b9..5e43e9dc65d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock); * limitation from this.) */ #define MIN_SHARES 2 -#define MAX_SHARES (1UL << 18) +#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION)) static int root_task_group_load = ROOT_TASK_GROUP_LOAD; #endif @@ -1330,13 +1330,25 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - tmp = (u64)delta_exec * weight; + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. + */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; if (!lw->inv_weight) { - if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; else - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = WMULT_CONST / w; } /* @@ -1778,17 +1790,20 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { + int prio = p->static_prio - MAX_RT_PRIO; + struct load_weight *load = &p->se.load; + /* * SCHED_IDLE tasks get minimal weight: */ if (p->policy == SCHED_IDLE) { - p->se.load.weight = WEIGHT_IDLEPRIO; - p->se.load.inv_weight = WMULT_IDLEPRIO; + load->weight = scale_load(WEIGHT_IDLEPRIO); + load->inv_weight = WMULT_IDLEPRIO; return; } - p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; - p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; + load->weight = scale_load(prio_to_weight[prio]); + load->inv_weight = prio_to_wmult[prio]; } static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -2564,7 +2579,7 @@ static void ttwu_queue(struct task_struct *p, int cpu) { struct rq *rq = cpu_rq(cpu); -#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE) +#if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { ttwu_queue_remote(p, cpu); return; @@ -6527,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->cpu_power != SCHED_LOAD_SCALE) { + if (group->cpu_power != SCHED_POWER_SCALE) { printk(KERN_CONT " (cpu_power = %d)", group->cpu_power); } @@ -7902,7 +7917,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_power = SCHED_LOAD_SCALE; + rq->cpu_power = SCHED_POWER_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; @@ -8749,42 +8764,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) return 0; } -static int -cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct task_struct *tsk, bool threadgroup) -{ - int retval = cpu_cgroup_can_attach_task(cgrp, tsk); - if (retval) - return retval; - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - retval = cpu_cgroup_can_attach_task(cgrp, c); - if (retval) { - rcu_read_unlock(); - return retval; - } - } - rcu_read_unlock(); - } - return 0; -} - static void -cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cont, struct task_struct *tsk, - bool threadgroup) +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { sched_move_task(tsk); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - sched_move_task(c); - } - rcu_read_unlock(); - } } static void @@ -8806,14 +8789,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), shareval); + return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); } static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); - return (u64) tg->shares; + return (u64) scale_load_down(tg->shares); } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8872,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", .create = cpu_cgroup_create, .destroy = cpu_cgroup_destroy, - .can_attach = cpu_cgroup_can_attach, - .attach = cpu_cgroup_attach, + .can_attach_task = cpu_cgroup_can_attach_task, + .attach_task = cpu_cgroup_attach_task, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 37f22626225e..e32a9b70ee9c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1584,7 +1584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; @@ -1722,7 +1722,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) nr_running += cpu_rq(i)->cfs.nr_running; } - capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); if (tmp->flags & SD_POWERSAVINGS_BALANCE) nr_running /= 2; @@ -2570,7 +2570,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { - return SCHED_LOAD_SCALE; + return SCHED_POWER_SCALE; } unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) @@ -2607,10 +2607,10 @@ unsigned long scale_rt_power(int cpu) available = total - rq->rt_avg; } - if (unlikely((s64)total < SCHED_LOAD_SCALE)) - total = SCHED_LOAD_SCALE; + if (unlikely((s64)total < SCHED_POWER_SCALE)) + total = SCHED_POWER_SCALE; - total >>= SCHED_LOAD_SHIFT; + total >>= SCHED_POWER_SHIFT; return div_u64(available, total); } @@ -2618,7 +2618,7 @@ unsigned long scale_rt_power(int cpu) static void update_cpu_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; - unsigned long power = SCHED_LOAD_SCALE; + unsigned long power = SCHED_POWER_SCALE; struct sched_group *sdg = sd->groups; if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { @@ -2627,7 +2627,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_smt_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; } sdg->cpu_power_orig = power; @@ -2637,10 +2637,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) else power *= default_scale_freq_power(sd, cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; power *= scale_rt_power(cpu); - power >>= SCHED_LOAD_SHIFT; + power >>= SCHED_POWER_SHIFT; if (!power) power = 1; @@ -2682,7 +2682,7 @@ static inline int fix_small_capacity(struct sched_domain *sd, struct sched_group *group) { /* - * Only siblings can have significantly less than SCHED_LOAD_SCALE + * Only siblings can have significantly less than SCHED_POWER_SCALE */ if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; @@ -2770,7 +2770,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power; /* * Consider the group unbalanced when the imbalance is larger @@ -2787,7 +2787,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); + sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, + SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(sd, group); sgs->group_weight = group->group_weight; @@ -2961,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd, return 0; *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power, - SCHED_LOAD_SCALE); + SCHED_POWER_SCALE); return 1; } @@ -2990,7 +2991,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, cpu_avg_load_per_task(this_cpu); scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_LOAD_SCALE; + * SCHED_POWER_SCALE; scaled_busy_load_per_task /= sds->busiest->cpu_power; if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= @@ -3009,10 +3010,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, min(sds->busiest_load_per_task, sds->max_load); pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); - pwr_now /= SCHED_LOAD_SCALE; + pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->busiest->cpu_power; if (sds->max_load > tmp) pwr_move += sds->busiest->cpu_power * @@ -3020,15 +3021,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, /* Amount of load we'd add */ if (sds->max_load * sds->busiest->cpu_power < - sds->busiest_load_per_task * SCHED_LOAD_SCALE) + sds->busiest_load_per_task * SCHED_POWER_SCALE) tmp = (sds->max_load * sds->busiest->cpu_power) / sds->this->cpu_power; else - tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / sds->this->cpu_power; pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; + pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) @@ -3070,7 +3071,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, load_above_capacity = (sds->busiest_nr_running - sds->busiest_group_capacity); - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE); + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); load_above_capacity /= sds->busiest->cpu_power; } @@ -3090,7 +3091,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, /* How much load to actually move to equalise the imbalance */ *imbalance = min(max_pull * sds->busiest->cpu_power, (sds->avg_load - sds->this_load) * sds->this->cpu_power) - / SCHED_LOAD_SCALE; + / SCHED_POWER_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -3159,7 +3160,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (!sds.busiest || sds.busiest_nr_running == 0) goto out_balanced; - sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; /* * If the busiest group is imbalanced the below checks don't @@ -3238,7 +3239,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, for_each_cpu(i, sched_group_cpus(group)) { unsigned long power = power_of(i); - unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); unsigned long wl; if (!capacity) @@ -3263,7 +3265,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group, * the load can be moved away from the cpu that is potentially * running at a lower capacity. */ - wl = (wl * SCHED_LOAD_SCALE) / power; + wl = (wl * SCHED_POWER_SCALE) / power; if (wl > max_load) { max_load = wl; diff --git a/kernel/signal.c b/kernel/signal.c index 7165af5f1b11..86c32b884f8e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) static int recalc_sigpending_tsk(struct task_struct *t) { - if (t->signal->group_stop_count > 0 || + if ((t->group_stop & GROUP_STOP_PENDING) || PENDING(&t->pending, &t->blocked) || PENDING(&t->signal->shared_pending, &t->blocked)) { set_tsk_thread_flag(t, TIF_SIGPENDING); @@ -223,6 +223,83 @@ static inline void print_dropped_signal(int sig) current->comm, current->pid, sig); } +/** + * task_clear_group_stop_trapping - clear group stop trapping bit + * @task: target task + * + * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it + * and wake up the ptracer. Note that we don't need any further locking. + * @task->siglock guarantees that @task->parent points to the ptracer. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void task_clear_group_stop_trapping(struct task_struct *task) +{ + if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { + task->group_stop &= ~GROUP_STOP_TRAPPING; + __wake_up_sync_key(&task->parent->signal->wait_chldexit, + TASK_UNINTERRUPTIBLE, 1, task); + } +} + +/** + * task_clear_group_stop_pending - clear pending group stop + * @task: target task + * + * Clear group stop states for @task. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +void task_clear_group_stop_pending(struct task_struct *task) +{ + task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | + GROUP_STOP_DEQUEUED); +} + +/** + * task_participate_group_stop - participate in a group stop + * @task: task participating in a group stop + * + * @task has GROUP_STOP_PENDING set and is participating in a group stop. + * Group stop states are cleared and the group stop count is consumed if + * %GROUP_STOP_CONSUME was set. If the consumption completes the group + * stop, the appropriate %SIGNAL_* flags are set. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + * + * RETURNS: + * %true if group stop completion should be notified to the parent, %false + * otherwise. + */ +static bool task_participate_group_stop(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + bool consume = task->group_stop & GROUP_STOP_CONSUME; + + WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); + + task_clear_group_stop_pending(task); + + if (!consume) + return false; + + if (!WARN_ON_ONCE(sig->group_stop_count == 0)) + sig->group_stop_count--; + + /* + * Tell the caller to notify completion iff we are entering into a + * fresh group stop. Read comment in do_signal_stop() for details. + */ + if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { + sig->flags = SIGNAL_STOP_STOPPED; + return true; + } + return false; +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an @@ -527,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * is to alert stop-signal processing code when another * processor has come along and cleared the flag. */ - tsk->signal->flags |= SIGNAL_STOP_DEQUEUED; + current->group_stop |= GROUP_STOP_DEQUEUED; } if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { /* @@ -592,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) if (sigisemptyset(&m)) return 0; - signandsets(&s->signal, &s->signal, mask); + sigandnsets(&s->signal, &s->signal, mask); list_for_each_entry_safe(q, n, &s->list, list) { if (sigismember(mask, q->info.si_signo)) { list_del_init(&q->list); @@ -727,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) } else if (sig == SIGCONT) { unsigned int why; /* - * Remove all stop signals from all queues, - * and wake all threads. + * Remove all stop signals from all queues, wake all threads. */ rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); t = p; do { - unsigned int state; + task_clear_group_stop_pending(t); rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); - /* - * If there is a handler for SIGCONT, we must make - * sure that no thread returns to user mode before - * we post the signal, in case it was the only - * thread eligible to run the signal handler--then - * it must not do anything between resuming and - * running the handler. With the TIF_SIGPENDING - * flag set, the thread will pause and acquire the - * siglock that we hold now and until we've queued - * the pending signal. - * - * Wake up the stopped thread _after_ setting - * TIF_SIGPENDING - */ - state = __TASK_STOPPED; - if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { - set_tsk_thread_flag(t, TIF_SIGPENDING); - state |= TASK_INTERRUPTIBLE; - } - wake_up_state(t, state); + wake_up_state(t, __TASK_STOPPED); } while_each_thread(p, t); /* @@ -780,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) signal->flags = why | SIGNAL_STOP_CONTINUED; signal->group_stop_count = 0; signal->group_exit_code = 0; - } else { - /* - * We are not stopped, but there could be a stop - * signal in the middle of being processed after - * being removed from the queue. Clear that too. - */ - signal->flags &= ~SIGNAL_STOP_DEQUEUED; } } @@ -875,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group) signal->group_stop_count = 0; t = p; do { + task_clear_group_stop_pending(t); sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } while_each_thread(p, t); @@ -1109,6 +1160,7 @@ int zap_other_threads(struct task_struct *p) p->signal->group_stop_count = 0; while_each_thread(p, t) { + task_clear_group_stop_pending(t); count++; /* Don't bother with already dead threads */ @@ -1536,16 +1588,30 @@ int do_notify_parent(struct task_struct *tsk, int sig) return ret; } -static void do_notify_parent_cldstop(struct task_struct *tsk, int why) +/** + * do_notify_parent_cldstop - notify parent of stopped/continued state change + * @tsk: task reporting the state change + * @for_ptracer: the notification is for ptracer + * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report + * + * Notify @tsk's parent that the stopped/continued state has changed. If + * @for_ptracer is %false, @tsk's group leader notifies to its real parent. + * If %true, @tsk reports to @tsk->parent which should be the ptracer. + * + * CONTEXT: + * Must be called with tasklist_lock at least read locked. + */ +static void do_notify_parent_cldstop(struct task_struct *tsk, + bool for_ptracer, int why) { struct siginfo info; unsigned long flags; struct task_struct *parent; struct sighand_struct *sighand; - if (task_ptrace(tsk)) + if (for_ptracer) { parent = tsk->parent; - else { + } else { tsk = tsk->group_leader; parent = tsk->real_parent; } @@ -1621,6 +1687,15 @@ static int sigkill_pending(struct task_struct *tsk) } /* + * Test whether the target task of the usual cldstop notification - the + * real_parent of @child - is in the same group as the ptracer. + */ +static bool real_parent_is_ptracer(struct task_struct *child) +{ + return same_thread_group(child->parent, child->real_parent); +} + +/* * This must be called with current->sighand->siglock held. * * This should be the path for all ptrace stops. @@ -1631,10 +1706,12 @@ static int sigkill_pending(struct task_struct *tsk) * If we actually decide not to stop at all because the tracer * is gone, we keep current->exit_code unless clear_code. */ -static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) +static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) __releases(¤t->sighand->siglock) __acquires(¤t->sighand->siglock) { + bool gstop_done = false; + if (arch_ptrace_stop_needed(exit_code, info)) { /* * The arch code has something special to do before a @@ -1655,21 +1732,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) } /* - * If there is a group stop in progress, - * we must participate in the bookkeeping. + * If @why is CLD_STOPPED, we're trapping to participate in a group + * stop. Do the bookkeeping. Note that if SIGCONT was delievered + * while siglock was released for the arch hook, PENDING could be + * clear now. We act as if SIGCONT is received after TASK_TRACED + * is entered - ignore it. */ - if (current->signal->group_stop_count > 0) - --current->signal->group_stop_count; + if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING)) + gstop_done = task_participate_group_stop(current); current->last_siginfo = info; current->exit_code = exit_code; - /* Let the debugger run. */ - __set_current_state(TASK_TRACED); + /* + * TRACED should be visible before TRAPPING is cleared; otherwise, + * the tracer might fail do_wait(). + */ + set_current_state(TASK_TRACED); + + /* + * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and + * transition to TASK_TRACED should be atomic with respect to + * siglock. This hsould be done after the arch hook as siglock is + * released and regrabbed across it. + */ + task_clear_group_stop_trapping(current); + spin_unlock_irq(¤t->sighand->siglock); read_lock(&tasklist_lock); if (may_ptrace_stop()) { - do_notify_parent_cldstop(current, CLD_TRAPPED); + /* + * Notify parents of the stop. + * + * While ptraced, there are two parents - the ptracer and + * the real_parent of the group_leader. The ptracer should + * know about every stop while the real parent is only + * interested in the completion of group stop. The states + * for the two don't interact with each other. Notify + * separately unless they're gonna be duplicates. + */ + do_notify_parent_cldstop(current, true, why); + if (gstop_done && !real_parent_is_ptracer(current)) + do_notify_parent_cldstop(current, false, why); + /* * Don't want to allow preemption here, because * sys_ptrace() needs this task to be inactive. @@ -1684,7 +1789,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) /* * By the time we got the lock, our tracer went away. * Don't drop the lock yet, another tracer may come. + * + * If @gstop_done, the ptracer went away between group stop + * completion and here. During detach, it would have set + * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED + * in do_signal_stop() on return, so notifying the real + * parent of the group stop completion is enough. */ + if (gstop_done) + do_notify_parent_cldstop(current, false, why); + __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; @@ -1728,7 +1842,7 @@ void ptrace_notify(int exit_code) /* Let the debugger run. */ spin_lock_irq(¤t->sighand->siglock); - ptrace_stop(exit_code, 1, &info); + ptrace_stop(exit_code, CLD_TRAPPED, 1, &info); spin_unlock_irq(¤t->sighand->siglock); } @@ -1741,66 +1855,115 @@ void ptrace_notify(int exit_code) static int do_signal_stop(int signr) { struct signal_struct *sig = current->signal; - int notify; - if (!sig->group_stop_count) { + if (!(current->group_stop & GROUP_STOP_PENDING)) { + unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME; struct task_struct *t; - if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) || + /* signr will be recorded in task->group_stop for retries */ + WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK); + + if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) || unlikely(signal_group_exit(sig))) return 0; /* - * There is no group stop already in progress. - * We must initiate one now. + * There is no group stop already in progress. We must + * initiate one now. + * + * While ptraced, a task may be resumed while group stop is + * still in effect and then receive a stop signal and + * initiate another group stop. This deviates from the + * usual behavior as two consecutive stop signals can't + * cause two group stops when !ptraced. That is why we + * also check !task_is_stopped(t) below. + * + * The condition can be distinguished by testing whether + * SIGNAL_STOP_STOPPED is already set. Don't generate + * group_exit_code in such case. + * + * This is not necessary for SIGNAL_STOP_CONTINUED because + * an intervening stop signal is required to cause two + * continued events regardless of ptrace. */ - sig->group_exit_code = signr; + if (!(sig->flags & SIGNAL_STOP_STOPPED)) + sig->group_exit_code = signr; + else + WARN_ON_ONCE(!task_ptrace(current)); + current->group_stop &= ~GROUP_STOP_SIGMASK; + current->group_stop |= signr | gstop; sig->group_stop_count = 1; - for (t = next_thread(current); t != current; t = next_thread(t)) + for (t = next_thread(current); t != current; + t = next_thread(t)) { + t->group_stop &= ~GROUP_STOP_SIGMASK; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, * so this check has no races. */ - if (!(t->flags & PF_EXITING) && - !task_is_stopped_or_traced(t)) { + if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) { + t->group_stop |= signr | gstop; sig->group_stop_count++; signal_wake_up(t, 0); } + } } - /* - * If there are no other threads in the group, or if there is - * a group stop in progress and we are the last to stop, report - * to the parent. When ptraced, every thread reports itself. - */ - notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0; - notify = tracehook_notify_jctl(notify, CLD_STOPPED); - /* - * tracehook_notify_jctl() can drop and reacquire siglock, so - * we keep ->group_stop_count != 0 before the call. If SIGCONT - * or SIGKILL comes in between ->group_stop_count == 0. - */ - if (sig->group_stop_count) { - if (!--sig->group_stop_count) - sig->flags = SIGNAL_STOP_STOPPED; - current->exit_code = sig->group_exit_code; +retry: + if (likely(!task_ptrace(current))) { + int notify = 0; + + /* + * If there are no other threads in the group, or if there + * is a group stop in progress and we are the last to stop, + * report to the parent. + */ + if (task_participate_group_stop(current)) + notify = CLD_STOPPED; + __set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + + /* + * Notify the parent of the group stop completion. Because + * we're not holding either the siglock or tasklist_lock + * here, ptracer may attach inbetween; however, this is for + * group stop and should always be delivered to the real + * parent of the group leader. The new ptracer will get + * its notification when this task transitions into + * TASK_TRACED. + */ + if (notify) { + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, false, notify); + read_unlock(&tasklist_lock); + } + + /* Now we don't run again until woken by SIGCONT or SIGKILL */ + schedule(); + + spin_lock_irq(¤t->sighand->siglock); + } else { + ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK, + CLD_STOPPED, 0, NULL); + current->exit_code = 0; } - spin_unlock_irq(¤t->sighand->siglock); - if (notify) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, notify); - read_unlock(&tasklist_lock); + /* + * GROUP_STOP_PENDING could be set if another group stop has + * started since being woken up or ptrace wants us to transit + * between TASK_STOPPED and TRACED. Retry group stop. + */ + if (current->group_stop & GROUP_STOP_PENDING) { + WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK)); + goto retry; } - /* Now we don't run again until woken by SIGCONT or SIGKILL */ - do { - schedule(); - } while (try_to_freeze()); + /* PTRACE_ATTACH might have raced with task killing, clear trapping */ + task_clear_group_stop_trapping(current); + + spin_unlock_irq(¤t->sighand->siglock); tracehook_finish_jctl(); - current->exit_code = 0; return 1; } @@ -1814,7 +1977,7 @@ static int ptrace_signal(int signr, siginfo_t *info, ptrace_signal_deliver(regs, cookie); /* Let the debugger run. */ - ptrace_stop(signr, 0, info); + ptrace_stop(signr, CLD_TRAPPED, 0, info); /* We're back. Did the debugger cancel the sig? */ signr = current->exit_code; @@ -1869,18 +2032,36 @@ relock: * the CLD_ si_code into SIGNAL_CLD_MASK bits. */ if (unlikely(signal->flags & SIGNAL_CLD_MASK)) { - int why = (signal->flags & SIGNAL_STOP_CONTINUED) - ? CLD_CONTINUED : CLD_STOPPED; + struct task_struct *leader; + int why; + + if (signal->flags & SIGNAL_CLD_CONTINUED) + why = CLD_CONTINUED; + else + why = CLD_STOPPED; + signal->flags &= ~SIGNAL_CLD_MASK; - why = tracehook_notify_jctl(why, CLD_CONTINUED); spin_unlock_irq(&sighand->siglock); - if (why) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, why); - read_unlock(&tasklist_lock); - } + /* + * Notify the parent that we're continuing. This event is + * always per-process and doesn't make whole lot of sense + * for ptracers, who shouldn't consume the state via + * wait(2) either, but, for backward compatibility, notify + * the ptracer of the group leader too unless it's gonna be + * a duplicate. + */ + read_lock(&tasklist_lock); + + do_notify_parent_cldstop(current, false, why); + + leader = current->group_leader; + if (task_ptrace(leader) && !real_parent_is_ptracer(leader)) + do_notify_parent_cldstop(leader, true, why); + + read_unlock(&tasklist_lock); + goto relock; } @@ -1897,8 +2078,8 @@ relock: if (unlikely(signr != 0)) ka = return_ka; else { - if (unlikely(signal->group_stop_count > 0) && - do_signal_stop(0)) + if (unlikely(current->group_stop & + GROUP_STOP_PENDING) && do_signal_stop(0)) goto relock; signr = dequeue_signal(current, ¤t->blocked, @@ -2017,10 +2198,42 @@ relock: return signr; } +/* + * It could be that complete_signal() picked us to notify about the + * group-wide signal. Other threads should be notified now to take + * the shared signals in @which since we will not. + */ +static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) +{ + sigset_t retarget; + struct task_struct *t; + + sigandsets(&retarget, &tsk->signal->shared_pending.signal, which); + if (sigisemptyset(&retarget)) + return; + + t = tsk; + while_each_thread(tsk, t) { + if (t->flags & PF_EXITING) + continue; + + if (!has_pending_signals(&retarget, &t->blocked)) + continue; + /* Remove the signals this thread can handle. */ + sigandsets(&retarget, &retarget, &t->blocked); + + if (!signal_pending(t)) + signal_wake_up(t, 0); + + if (sigisemptyset(&retarget)) + break; + } +} + void exit_signals(struct task_struct *tsk) { int group_stop = 0; - struct task_struct *t; + sigset_t unblocked; if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { tsk->flags |= PF_EXITING; @@ -2036,26 +2249,23 @@ void exit_signals(struct task_struct *tsk) if (!signal_pending(tsk)) goto out; - /* - * It could be that __group_complete_signal() choose us to - * notify about group-wide signal. Another thread should be - * woken now to take the signal since we will not. - */ - for (t = tsk; (t = next_thread(t)) != tsk; ) - if (!signal_pending(t) && !(t->flags & PF_EXITING)) - recalc_sigpending_and_wake(t); + unblocked = tsk->blocked; + signotset(&unblocked); + retarget_shared_pending(tsk, &unblocked); - if (unlikely(tsk->signal->group_stop_count) && - !--tsk->signal->group_stop_count) { - tsk->signal->flags = SIGNAL_STOP_STOPPED; - group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED); - } + if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) && + task_participate_group_stop(tsk)) + group_stop = CLD_STOPPED; out: spin_unlock_irq(&tsk->sighand->siglock); + /* + * If group stop has completed, deliver the notification. This + * should always go to the real parent of the group leader. + */ if (unlikely(group_stop)) { read_lock(&tasklist_lock); - do_notify_parent_cldstop(tsk, group_stop); + do_notify_parent_cldstop(tsk, false, group_stop); read_unlock(&tasklist_lock); } } @@ -2089,11 +2299,33 @@ long do_no_restart_syscall(struct restart_block *param) return -EINTR; } -/* - * We don't need to get the kernel lock - this is all local to this - * particular thread.. (and that's good, because this is _heavily_ - * used by various programs) +static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) +{ + if (signal_pending(tsk) && !thread_group_empty(tsk)) { + sigset_t newblocked; + /* A set of now blocked but previously unblocked signals. */ + sigandnsets(&newblocked, newset, ¤t->blocked); + retarget_shared_pending(tsk, &newblocked); + } + tsk->blocked = *newset; + recalc_sigpending(); +} + +/** + * set_current_blocked - change current->blocked mask + * @newset: new mask + * + * It is wrong to change ->blocked directly, this helper should be used + * to ensure the process can't miss a shared signal we are going to block. */ +void set_current_blocked(const sigset_t *newset) +{ + struct task_struct *tsk = current; + + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, newset); + spin_unlock_irq(&tsk->sighand->siglock); +} /* * This is also useful for kernel threads that want to temporarily @@ -2105,30 +2337,29 @@ long do_no_restart_syscall(struct restart_block *param) */ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) { - int error; + struct task_struct *tsk = current; + sigset_t newset; - spin_lock_irq(¤t->sighand->siglock); + /* Lockless, only current can change ->blocked, never from irq */ if (oldset) - *oldset = current->blocked; + *oldset = tsk->blocked; - error = 0; switch (how) { case SIG_BLOCK: - sigorsets(¤t->blocked, ¤t->blocked, set); + sigorsets(&newset, &tsk->blocked, set); break; case SIG_UNBLOCK: - signandsets(¤t->blocked, ¤t->blocked, set); + sigandnsets(&newset, &tsk->blocked, set); break; case SIG_SETMASK: - current->blocked = *set; + newset = *set; break; default: - error = -EINVAL; + return -EINVAL; } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return error; + set_current_blocked(&newset); + return 0; } /** @@ -2138,40 +2369,34 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset) * @oset: previous value of signal mask if non-null * @sigsetsize: size of sigset_t type */ -SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set, +SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset, sigset_t __user *, oset, size_t, sigsetsize) { - int error = -EINVAL; sigset_t old_set, new_set; + int error; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) - goto out; + return -EINVAL; - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; + old_set = current->blocked; + + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(sigset_t))) + return -EFAULT; sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); - error = sigprocmask(how, &new_set, &old_set); + error = sigprocmask(how, &new_set, NULL); if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked; - spin_unlock_irq(¤t->sighand->siglock); + return error; + } - set_old: - error = -EFAULT; - if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; + if (oset) { + if (copy_to_user(oset, &old_set, sizeof(sigset_t))) + return -EFAULT; } - error = 0; -out: - return error; + + return 0; } long do_sigpending(void __user *set, unsigned long sigsetsize) @@ -2284,6 +2509,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) #endif /** + * do_sigtimedwait - wait for queued signals specified in @which + * @which: queued signals to wait for + * @info: if non-null, the signal's siginfo is returned here + * @ts: upper bound on process time suspension + */ +int do_sigtimedwait(const sigset_t *which, siginfo_t *info, + const struct timespec *ts) +{ + struct task_struct *tsk = current; + long timeout = MAX_SCHEDULE_TIMEOUT; + sigset_t mask = *which; + int sig; + + if (ts) { + if (!timespec_valid(ts)) + return -EINVAL; + timeout = timespec_to_jiffies(ts); + /* + * We can be close to the next tick, add another one + * to ensure we will wait at least the time asked for. + */ + if (ts->tv_sec || ts->tv_nsec) + timeout++; + } + + /* + * Invert the set of allowed signals to get those we want to block. + */ + sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + signotset(&mask); + + spin_lock_irq(&tsk->sighand->siglock); + sig = dequeue_signal(tsk, &mask, info); + if (!sig && timeout) { + /* + * None ready, temporarily unblock those we're interested + * while we are sleeping in so that we'll be awakened when + * they arrive. Unblocking is always fine, we can avoid + * set_current_blocked(). + */ + tsk->real_blocked = tsk->blocked; + sigandsets(&tsk->blocked, &tsk->blocked, &mask); + recalc_sigpending(); + spin_unlock_irq(&tsk->sighand->siglock); + + timeout = schedule_timeout_interruptible(timeout); + + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, &tsk->real_blocked); + siginitset(&tsk->real_blocked, 0); + sig = dequeue_signal(tsk, &mask, info); + } + spin_unlock_irq(&tsk->sighand->siglock); + + if (sig) + return sig; + return timeout ? -EINTR : -EAGAIN; +} + +/** * sys_rt_sigtimedwait - synchronously wait for queued signals specified * in @uthese * @uthese: queued signals to wait for @@ -2295,11 +2580,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, siginfo_t __user *, uinfo, const struct timespec __user *, uts, size_t, sigsetsize) { - int ret, sig; sigset_t these; struct timespec ts; siginfo_t info; - long timeout = 0; + int ret; /* XXX: Don't preclude handling different sized sigset_t's. */ if (sigsetsize != sizeof(sigset_t)) @@ -2308,61 +2592,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, if (copy_from_user(&these, uthese, sizeof(these))) return -EFAULT; - /* - * Invert the set of allowed signals to get those we - * want to block. - */ - sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); - signotset(&these); - if (uts) { if (copy_from_user(&ts, uts, sizeof(ts))) return -EFAULT; - if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 - || ts.tv_sec < 0) - return -EINVAL; } - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - if (!sig) { - timeout = MAX_SCHEDULE_TIMEOUT; - if (uts) - timeout = (timespec_to_jiffies(&ts) - + (ts.tv_sec || ts.tv_nsec)); - - if (timeout) { - /* - * None ready -- temporarily unblock those we're - * interested while we are sleeping in so that we'll - * be awakened when they arrive. - */ - current->real_blocked = current->blocked; - sigandsets(¤t->blocked, ¤t->blocked, &these); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - - timeout = schedule_timeout_interruptible(timeout); - - spin_lock_irq(¤t->sighand->siglock); - sig = dequeue_signal(current, &these, &info); - current->blocked = current->real_blocked; - siginitset(¤t->real_blocked, 0); - recalc_sigpending(); - } - } - spin_unlock_irq(¤t->sighand->siglock); + ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); - if (sig) { - ret = sig; - if (uinfo) { - if (copy_siginfo_to_user(uinfo, &info)) - ret = -EFAULT; - } - } else { - ret = -EAGAIN; - if (timeout) - ret = -EINTR; + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user(uinfo, &info)) + ret = -EFAULT; } return ret; @@ -2650,60 +2889,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) /** * sys_sigprocmask - examine and change blocked signals * @how: whether to add, remove, or set signals - * @set: signals to add or remove (if non-null) + * @nset: signals to add or remove (if non-null) * @oset: previous value of signal mask if non-null * * Some platforms have their own version with special arguments; * others support only sys_rt_sigprocmask. */ -SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set, +SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, old_sigset_t __user *, oset) { - int error; old_sigset_t old_set, new_set; + sigset_t new_blocked; - if (set) { - error = -EFAULT; - if (copy_from_user(&new_set, set, sizeof(*set))) - goto out; + old_set = current->blocked.sig[0]; + + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(*nset))) + return -EFAULT; new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); - spin_lock_irq(¤t->sighand->siglock); - old_set = current->blocked.sig[0]; + new_blocked = current->blocked; - error = 0; switch (how) { - default: - error = -EINVAL; - break; case SIG_BLOCK: - sigaddsetmask(¤t->blocked, new_set); + sigaddsetmask(&new_blocked, new_set); break; case SIG_UNBLOCK: - sigdelsetmask(¤t->blocked, new_set); + sigdelsetmask(&new_blocked, new_set); break; case SIG_SETMASK: - current->blocked.sig[0] = new_set; + new_blocked.sig[0] = new_set; break; + default: + return -EINVAL; } - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - if (error) - goto out; - if (oset) - goto set_old; - } else if (oset) { - old_set = current->blocked.sig[0]; - set_old: - error = -EFAULT; + set_current_blocked(&new_blocked); + } + + if (oset) { if (copy_to_user(oset, &old_set, sizeof(*oset))) - goto out; + return -EFAULT; } - error = 0; -out: - return error; + + return 0; } #endif /* __ARCH_WANT_SYS_SIGPROCMASK */ @@ -2793,8 +3023,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) SYSCALL_DEFINE0(pause) { - current->state = TASK_INTERRUPTIBLE; - schedule(); + while (!signal_pending(current)) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } return -ERESTARTNOHAND; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 25cc41cd8f33..62cbc8877fef 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt); cond_syscall(compat_sys_getsockopt); cond_syscall(sys_shutdown); cond_syscall(sys_sendmsg); +cond_syscall(sys_sendmmsg); cond_syscall(compat_sys_sendmsg); +cond_syscall(compat_sys_sendmmsg); cond_syscall(sys_recvmsg); cond_syscall(sys_recvmmsg); cond_syscall(compat_sys_recvmsg); @@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait); cond_syscall(sys_semget); cond_syscall(sys_semop); cond_syscall(sys_semtimedop); +cond_syscall(compat_sys_semtimedop); cond_syscall(sys_semctl); +cond_syscall(compat_sys_semctl); cond_syscall(sys_msgget); cond_syscall(sys_msgsnd); +cond_syscall(compat_sys_msgsnd); cond_syscall(sys_msgrcv); +cond_syscall(compat_sys_msgrcv); cond_syscall(sys_msgctl); +cond_syscall(compat_sys_msgctl); cond_syscall(sys_shmget); cond_syscall(sys_shmat); +cond_syscall(compat_sys_shmat); cond_syscall(sys_shmdt); cond_syscall(sys_shmctl); +cond_syscall(compat_sys_shmctl); cond_syscall(sys_mq_open); cond_syscall(sys_mq_unlink); cond_syscall(sys_mq_timedsend); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c0bb32414b17..4fc92445a29c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -56,6 +56,7 @@ #include <linux/kprobes.h> #include <linux/pipe_fs_i.h> #include <linux/oom.h> +#include <linux/kmod.h> #include <asm/uaccess.h> #include <asm/processor.h> @@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = { .child = random_table, }, { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, + { .procname = "overflowuid", .data = &overflowuid, .maxlen = sizeof(int), @@ -730,14 +736,16 @@ static struct ctl_table kern_table[] = { .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog_enabled, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, }, { .procname = "watchdog_thresh", - .data = &softlockup_thresh, + .data = &watchdog_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dowatchdog_thresh, + .proc_handler = proc_dowatchdog, .extra1 = &neg_one, .extra2 = &sixty, }, @@ -755,7 +763,9 @@ static struct ctl_table kern_table[] = { .data = &watchdog_enabled, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = proc_dowatchdog_enabled, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, }, #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) @@ -1496,7 +1506,7 @@ static struct ctl_table fs_table[] = { static struct ctl_table debug_table[] = { #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ - defined(CONFIG_S390) + defined(CONFIG_S390) || defined(CONFIG_TILE) { .procname = "exception-trace", .data = &show_unhandled_signals, diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 9265014cb4db..2d966244ea60 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -494,7 +494,7 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type, */ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) { - enum alarmtimer_type type = restart->nanosleep.index; + enum alarmtimer_type type = restart->nanosleep.clockid; ktime_t exp; struct timespec __user *rmtp; struct alarm alarm; @@ -573,7 +573,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, restart = ¤t_thread_info()->restart_block; restart->fn = alarm_timer_nsleep_restart; - restart->nanosleep.index = type; + restart->nanosleep.clockid = type; restart->nanosleep.expires = exp.tv64; restart->nanosleep.rmtp = rmtp; ret = -ERESTART_RESTARTBLOCK; @@ -669,12 +669,20 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr) */ static int __init alarmtimer_init_late(void) { + struct device *dev; char *str; /* Find an rtc device and init the rtc_timer */ - class_find_device(rtc_class, NULL, &str, has_wakealarm); - if (str) + dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); + /* If we have a device then str is valid. See has_wakealarm() */ + if (dev) { rtcdev = rtc_class_open(str); + /* + * Drop the reference we got in class_find_device, + * rtc_open takes its own. + */ + put_device(dev); + } if (!rtcdev) { printk(KERN_WARNING "No RTC device found, ALARM timers will" " not wake from suspend"); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 22a9da9a9c96..c027d4f602f1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { - unsigned long sec; + u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index d9d5f8c885f6..1c95fd677328 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -639,7 +639,7 @@ static void clocksource_enqueue(struct clocksource *cs) */ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { - unsigned long sec; + u64 sec; /* * Calc the maximum number of seconds which we can run before diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 723c7637e55a..c7218d132738 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason) unsigned long flags; int cpu; - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - /* * Periodic mode does not care about the enter/exit of power * states */ if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) - goto out; + return; - bc = tick_broadcast_device.evtdev; + /* + * We are called with preemtion disabled from the depth of the + * idle code, so we can't be moved away. + */ cpu = smp_processor_id(); td = &per_cpu(tick_cpu_device, cpu); dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; + return; + + bc = tick_broadcast_device.evtdev; + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); @@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) tick_program_event(dev->next_event, 1); } } - -out: raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8e6a05a5915a..342408cf68dd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -680,7 +680,7 @@ static void timekeeping_resume(void) clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); /* Resume hrtimers */ - hres_timers_resume(); + hrtimers_resume(); } static int timekeeping_suspend(void) @@ -1099,6 +1099,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } /** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +ktime_t ktime_get_monotonic_offset(void) +{ + unsigned long seq; + struct timespec wtom; + + do { + seq = read_seqbegin(&xtime_lock); + wtom = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + return timespec_to_ktime(wtom); +} + +/** * xtime_update() - advances the timekeeping infrastructure * @ticks: number of ticks, that have elapsed since the last call. * diff --git a/kernel/utsname.c b/kernel/utsname.c index 44646179eaba..bff131b9510a 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -15,6 +15,7 @@ #include <linux/err.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/proc_fs.h> static struct uts_namespace *create_uts_ns(void) { @@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref) put_user_ns(ns->user_ns); kfree(ns); } + +static void *utsns_get(struct task_struct *task) +{ + struct uts_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->uts_ns; + get_uts_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void utsns_put(void *ns) +{ + put_uts_ns(ns); +} + +static int utsns_install(struct nsproxy *nsproxy, void *ns) +{ + get_uts_ns(ns); + put_uts_ns(nsproxy->uts_ns); + nsproxy->uts_ns = ns; + return 0; +} + +const struct proc_ns_operations utsns_operations = { + .name = "uts", + .type = CLONE_NEWUTS, + .get = utsns_get, + .put = utsns_put, + .install = utsns_install, +}; + diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 14733d4d156b..7daa4b072e9f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -28,7 +28,7 @@ #include <linux/perf_event.h> int watchdog_enabled = 1; -int __read_mostly softlockup_thresh = 60; +int __read_mostly watchdog_thresh = 10; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); @@ -91,6 +91,17 @@ static int __init nosoftlockup_setup(char *str) __setup("nosoftlockup", nosoftlockup_setup); /* */ +/* + * Hard-lockup warnings should be triggered after just a few seconds. Soft- + * lockups can have false positives under extreme conditions. So we generally + * want a higher threshold for soft lockups than for hard lockups. So we couple + * the thresholds with a factor: we make the soft threshold twice the amount of + * time the hard threshold is. + */ +static int get_softlockup_thresh(void) +{ + return watchdog_thresh * 2; +} /* * Returns seconds, approximately. We don't need nanosecond @@ -105,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu) static unsigned long get_sample_period(void) { /* - * convert softlockup_thresh from seconds to ns + * convert watchdog_thresh from seconds to ns * the divide by 5 is to give hrtimer 5 chances to * increment before the hardlockup detector generates * a warning */ - return softlockup_thresh / 5 * NSEC_PER_SEC; + return get_softlockup_thresh() * (NSEC_PER_SEC / 5); } /* Commands for resetting the watchdog */ @@ -182,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts) unsigned long now = get_timestamp(smp_processor_id()); /* Warn about unreasonable delays: */ - if (time_after(now, touch_ts + softlockup_thresh)) + if (time_after(now, touch_ts + get_softlockup_thresh())) return now - touch_ts; return 0; @@ -359,7 +370,7 @@ static int watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; - wd_attr->sample_period = hw_nmi_get_sample_period(); + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); @@ -501,28 +512,25 @@ static void watchdog_disable_all_cpus(void) /* sysctl functions */ #ifdef CONFIG_SYSCTL /* - * proc handler for /proc/sys/kernel/nmi_watchdog + * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh */ -int proc_dowatchdog_enabled(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) +int proc_dowatchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - proc_dointvec(table, write, buffer, length, ppos); + int ret; - if (write) { - if (watchdog_enabled) - watchdog_enable_all_cpus(); - else - watchdog_disable_all_cpus(); - } - return 0; -} + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + goto out; -int proc_dowatchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) -{ - return proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (watchdog_enabled && watchdog_thresh) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + +out: + return ret; } #endif /* CONFIG_SYSCTL */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e3378e8d3a5c..0400553f0d04 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2866,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq) } } - /* just in case, make sure it's actually aligned - * - this is affected by PERCPU() alignment in vmlinux.lds.S - */ + /* just in case, make sure it's actually aligned */ BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); return wq->cpu_wq.v ? 0 : -ENOMEM; } |