summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2013-01-23 18:31:01 +0100
committerTejun Heo <tj@kernel.org>2013-01-23 18:31:01 +0100
commitc14afb82ffff5903a701a9fb737ac20f36d1f755 (patch)
tree304dcc7b1d7b9a5f564f7e978228e61ef41fbef2 /kernel
parentasync, kmod: warn on synchronous request_module() from async workers (diff)
parentMerge tag '3.8-pci-fixes-2' of git://git.kernel.org/pub/scm/linux/kernel/git/... (diff)
downloadlinux-c14afb82ffff5903a701a9fb737ac20f36d1f755.tar.xz
linux-c14afb82ffff5903a701a9fb737ac20f36d1f755.zip
Merge branch 'master' into for-3.9-async
To receive f56c3196f251012de9b3ebaff55732a9074fdaae ("async: fix __lowest_in_progress()"). Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile10
-rw-r--r--kernel/async.c30
-rw-r--r--kernel/audit.c40
-rw-r--r--kernel/audit_tree.c36
-rw-r--r--kernel/audit_watch.c6
-rw-r--r--kernel/auditfilter.c1
-rw-r--r--kernel/auditsc.c20
-rw-r--r--kernel/compat.c23
-rw-r--r--kernel/cred.c27
-rw-r--r--kernel/debug/kdb/kdb_main.c2
-rw-r--r--kernel/fork.c20
-rw-r--r--kernel/irq/manage.c2
-rw-r--r--kernel/kcmp.c1
-rw-r--r--kernel/kmod.c6
-rw-r--r--kernel/modsign_certificate.S19
-rw-r--r--kernel/modsign_pubkey.c21
-rw-r--r--kernel/module.c612
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/pid_namespace.c7
-rw-r--r--kernel/posix-cpu-timers.c3
-rw-r--r--kernel/printk.c5
-rw-r--r--kernel/ptrace.c74
-rw-r--r--kernel/res_counter.c20
-rw-r--r--kernel/rwsem.c10
-rw-r--r--kernel/sched/core.c3
-rw-r--r--kernel/sched/fair.c5
-rw-r--r--kernel/signal.c103
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/trace.c77
-rw-r--r--kernel/trace/trace_stack.c4
-rw-r--r--kernel/user_namespace.c2
-rw-r--r--kernel/utsname.c3
-rw-r--r--kernel/watchdog.c11
34 files changed, 832 insertions, 389 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index ac0d533eb7de..6c072b6da239 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -54,7 +54,7 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o
+obj-$(CONFIG_MODULE_SIG) += module_signing.o modsign_pubkey.o modsign_certificate.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
@@ -137,10 +137,14 @@ ifeq ($(CONFIG_MODULE_SIG),y)
#
# Pull the signing certificate and any extra certificates into the kernel
#
+
+quiet_cmd_touch = TOUCH $@
+ cmd_touch = touch $@
+
extra_certificates:
- touch $@
+ $(call cmd,touch)
-kernel/modsign_pubkey.o: signing_key.x509 extra_certificates
+kernel/modsign_certificate.o: signing_key.x509 extra_certificates
###############################################################################
#
diff --git a/kernel/async.c b/kernel/async.c
index d9bf2a9b5cee..6c68fc3fae7b 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -88,18 +88,27 @@ static atomic_t entry_count;
*/
static async_cookie_t __lowest_in_progress(struct async_domain *running)
{
+ async_cookie_t first_running = next_cookie; /* infinity value */
+ async_cookie_t first_pending = next_cookie; /* ditto */
struct async_entry *entry;
+ /*
+ * Both running and pending lists are sorted but not disjoint.
+ * Take the first cookies from both and return the min.
+ */
if (!list_empty(&running->domain)) {
entry = list_first_entry(&running->domain, typeof(*entry), list);
- return entry->cookie;
+ first_running = entry->cookie;
}
- list_for_each_entry(entry, &async_pending, list)
- if (entry->running == running)
- return entry->cookie;
+ list_for_each_entry(entry, &async_pending, list) {
+ if (entry->running == running) {
+ first_pending = entry->cookie;
+ break;
+ }
+ }
- return next_cookie; /* "infinity" value */
+ return min(first_running, first_pending);
}
static async_cookie_t lowest_in_progress(struct async_domain *running)
@@ -120,13 +129,17 @@ static void async_run_entry_fn(struct work_struct *work)
{
struct async_entry *entry =
container_of(work, struct async_entry, work);
+ struct async_entry *pos;
unsigned long flags;
ktime_t uninitialized_var(calltime), delta, rettime;
struct async_domain *running = entry->running;
- /* 1) move self to the running queue */
+ /* 1) move self to the running queue, make sure it stays sorted */
spin_lock_irqsave(&async_lock, flags);
- list_move_tail(&entry->list, &running->domain);
+ list_for_each_entry_reverse(pos, &running->domain, list)
+ if (entry->cookie < pos->cookie)
+ break;
+ list_move_tail(&entry->list, &pos->list);
spin_unlock_irqrestore(&async_lock, flags);
/* 2) run (and print duration) */
@@ -198,6 +211,9 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
+ /* mark that this task has queued an async job, used by module init */
+ current->flags |= PF_USED_ASYNC;
+
/* schedule for execution */
queue_work(system_unbound_wq, &entry->work);
diff --git a/kernel/audit.c b/kernel/audit.c
index 40414e9143db..d596e5355f15 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -272,6 +272,8 @@ static int audit_log_config_change(char *function_name, int new, int old,
int rc = 0;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return rc;
audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
old, from_kuid(&init_user_ns, loginuid), sessionid);
if (sid) {
@@ -619,6 +621,8 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
}
*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+ if (unlikely(!*ab))
+ return rc;
audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u",
task_tgid_vnr(current),
from_kuid(&init_user_ns, current_uid()),
@@ -1097,6 +1101,23 @@ static inline void audit_get_stamp(struct audit_context *ctx,
}
}
+/*
+ * Wait for auditd to drain the queue a little
+ */
+static void wait_for_auditd(unsigned long sleep_time)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&audit_backlog_wait, &wait);
+
+ if (audit_backlog_limit &&
+ skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
+ schedule_timeout(sleep_time);
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+}
+
/* Obtain an audit buffer. This routine does locking to obtain the
* audit buffer, but then no locking is required for calls to
* audit_log_*format. If the tsk is a task that is currently in a
@@ -1142,20 +1163,13 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
while (audit_backlog_limit
&& skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
- if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
- && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+ if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) {
+ unsigned long sleep_time;
- /* Wait for auditd to drain the queue a little */
- DECLARE_WAITQUEUE(wait, current);
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&audit_backlog_wait, &wait);
-
- if (audit_backlog_limit &&
- skb_queue_len(&audit_skb_queue) > audit_backlog_limit)
- schedule_timeout(timeout_start + audit_backlog_wait_time - jiffies);
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&audit_backlog_wait, &wait);
+ sleep_time = timeout_start + audit_backlog_wait_time -
+ jiffies;
+ if ((long)sleep_time > 0)
+ wait_for_auditd(sleep_time);
continue;
}
if (audit_rate_check() && printk_ratelimit())
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index ed206fd88cca..642a89c4f3d6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -249,7 +249,7 @@ static void untag_chunk(struct node *p)
list_del_rcu(&chunk->hash);
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
goto out;
}
@@ -291,7 +291,7 @@ static void untag_chunk(struct node *p)
owner->root = new;
spin_unlock(&hash_lock);
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(&new->mark); /* drop initial reference */
goto out;
@@ -331,7 +331,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
chunk->dead = 1;
spin_unlock(&entry->lock);
- fsnotify_destroy_mark(entry);
+ fsnotify_destroy_mark(entry, audit_tree_group);
fsnotify_put_mark(entry);
return 0;
}
@@ -412,7 +412,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(chunk_entry);
+ fsnotify_destroy_mark(chunk_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry);
fsnotify_put_mark(old_entry);
@@ -443,17 +443,32 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
spin_unlock(&hash_lock);
spin_unlock(&chunk_entry->lock);
spin_unlock(&old_entry->lock);
- fsnotify_destroy_mark(old_entry);
+ fsnotify_destroy_mark(old_entry, audit_tree_group);
fsnotify_put_mark(chunk_entry); /* drop initial reference */
fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */
return 0;
}
+static void audit_log_remove_rule(struct audit_krule *rule)
+{
+ struct audit_buffer *ab;
+
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "op=");
+ audit_log_string(ab, "remove rule");
+ audit_log_format(ab, " dir=");
+ audit_log_untrustedstring(ab, rule->tree->pathname);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
static void kill_rules(struct audit_tree *tree)
{
struct audit_krule *rule, *next;
struct audit_entry *entry;
- struct audit_buffer *ab;
list_for_each_entry_safe(rule, next, &tree->rules, rlist) {
entry = container_of(rule, struct audit_entry, rule);
@@ -461,14 +476,7 @@ static void kill_rules(struct audit_tree *tree)
list_del_init(&rule->rlist);
if (rule->tree) {
/* not a half-baked one */
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
- audit_log_format(ab, "op=");
- audit_log_string(ab, "remove rule");
- audit_log_format(ab, " dir=");
- audit_log_untrustedstring(ab, rule->tree->pathname);
- audit_log_key(ab, rule->filterkey);
- audit_log_format(ab, " list=%d res=1", rule->listnr);
- audit_log_end(ab);
+ audit_log_remove_rule(rule);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 9a9ae6e3d290..22831c4d369c 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -240,6 +240,8 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc
if (audit_enabled) {
struct audit_buffer *ab;
ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab, "auid=%u ses=%u op=",
from_kuid(&init_user_ns, audit_get_loginuid(current)),
audit_get_sessionid(current));
@@ -350,7 +352,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
}
mutex_unlock(&audit_filter_mutex);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
}
/* Get path information necessary for adding watches. */
@@ -457,7 +459,7 @@ void audit_remove_watch_rule(struct audit_krule *krule)
if (list_empty(&parent->watches)) {
audit_get_parent(parent);
- fsnotify_destroy_mark(&parent->mark);
+ fsnotify_destroy_mark(&parent->mark, audit_watch_group);
audit_put_parent(parent);
}
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7f19f23d38a3..f9fc54bbe06f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1144,7 +1144,6 @@ static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid,
* audit_receive_filter - apply all rules to the specified message type
* @type: audit message type
* @pid: target pid for netlink audit messages
- * @uid: target uid for netlink audit messages
* @seq: netlink audit message sequence (serial) number
* @data: payload data
* @datasz: size of payload data
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e37e6a12c5e3..a371f857a0a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1464,14 +1464,14 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_end(ab);
ab = audit_log_start(context, GFP_KERNEL,
AUDIT_IPC_SET_PERM);
+ if (unlikely(!ab))
+ return;
audit_log_format(ab,
"qbytes=%lx ouid=%u ogid=%u mode=%#ho",
context->ipc.qbytes,
context->ipc.perm_uid,
context->ipc.perm_gid,
context->ipc.perm_mode);
- if (!ab)
- return;
}
break; }
case AUDIT_MQ_OPEN: {
@@ -2675,7 +2675,7 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
-static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+static void audit_log_task(struct audit_buffer *ab)
{
kuid_t auid, uid;
kgid_t gid;
@@ -2693,6 +2693,11 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
audit_log_task_context(ab);
audit_log_format(ab, " pid=%d comm=", current->pid);
audit_log_untrustedstring(ab, current->comm);
+}
+
+static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr)
+{
+ audit_log_task(ab);
audit_log_format(ab, " reason=");
audit_log_string(ab, reason);
audit_log_format(ab, " sig=%ld", signr);
@@ -2715,6 +2720,8 @@ void audit_core_dumps(long signr)
return;
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
+ if (unlikely(!ab))
+ return;
audit_log_abend(ab, "memory violation", signr);
audit_log_end(ab);
}
@@ -2723,8 +2730,11 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
{
struct audit_buffer *ab;
- ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
- audit_log_abend(ab, "seccomp", signr);
+ ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_SECCOMP);
+ if (unlikely(!ab))
+ return;
+ audit_log_task(ab);
+ audit_log_format(ab, " sig=%ld", signr);
audit_log_format(ab, " syscall=%ld", syscall);
audit_log_format(ab, " compat=%d", is_compat_task());
audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
diff --git a/kernel/compat.c b/kernel/compat.c
index f6150e92dfc9..36700e9e2be9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru)
return 0;
}
-asmlinkage long
-compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
- struct compat_rusage __user *ru)
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
{
if (!ru) {
return sys_wait4(pid, stat_addr, options, NULL);
@@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options,
}
}
-asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
- struct compat_siginfo __user *uinfo, int options,
- struct compat_rusage __user *uru)
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, uinfo, int, options,
+ struct compat_rusage __user *, uru)
{
siginfo_t info;
struct rusage ru;
@@ -584,7 +587,11 @@ asmlinkage long compat_sys_waitid(int which, compat_pid_t pid,
return ret;
if (uru) {
- ret = put_compat_rusage(&ru, uru);
+ /* sys_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ ret = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ ret = put_compat_rusage(&ru, uru);
if (ret)
return ret;
}
@@ -994,7 +1001,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
sigset_from_compat(&s, &s32);
if (uts) {
- if (get_compat_timespec(&t, uts))
+ if (compat_get_timespec(&t, uts))
return -EFAULT;
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 8888afb846e9..e0573a43c7df 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -372,6 +372,31 @@ error_put:
return ret;
}
+static bool cred_cap_issubset(const struct cred *set, const struct cred *subset)
+{
+ const struct user_namespace *set_ns = set->user_ns;
+ const struct user_namespace *subset_ns = subset->user_ns;
+
+ /* If the two credentials are in the same user namespace see if
+ * the capabilities of subset are a subset of set.
+ */
+ if (set_ns == subset_ns)
+ return cap_issubset(subset->cap_permitted, set->cap_permitted);
+
+ /* The credentials are in a different user namespaces
+ * therefore one is a subset of the other only if a set is an
+ * ancestor of subset and set->euid is owner of subset or one
+ * of subsets ancestors.
+ */
+ for (;subset_ns != &init_user_ns; subset_ns = subset_ns->parent) {
+ if ((set_ns == subset_ns->parent) &&
+ uid_eq(subset_ns->owner, set->euid))
+ return true;
+ }
+
+ return false;
+}
+
/**
* commit_creds - Install new credentials upon the current task
* @new: The credentials to be assigned
@@ -410,7 +435,7 @@ int commit_creds(struct cred *new)
!gid_eq(old->egid, new->egid) ||
!uid_eq(old->fsuid, new->fsuid) ||
!gid_eq(old->fsgid, new->fsgid) ||
- !cap_issubset(new->cap_permitted, old->cap_permitted)) {
+ !cred_cap_issubset(old, new)) {
if (task->mm)
set_dumpable(task->mm, suid_dumpable);
task->pdeath_signal = 0;
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index 4d5f8d5612f3..8875254120b6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv)
kdb_printf("Module Size modstruct Used by\n");
list_for_each_entry(mod, kdb_modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
kdb_printf("%-20s%8u 0x%p ", mod->name,
mod->core_size, (void *)mod);
diff --git a/kernel/fork.c b/kernel/fork.c
index c36c4e301efe..c535f33bbb9c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -146,7 +146,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
int node)
{
- struct page *page = alloc_pages_node(node, THREADINFO_GFP,
+ struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
THREAD_SIZE_ORDER);
return page ? page_address(page) : NULL;
@@ -154,7 +154,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
- free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
static struct kmem_cache *thread_info_cache;
@@ -1166,6 +1166,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
current->signal->flags & SIGNAL_UNKILLABLE)
return ERR_PTR(-EINVAL);
+ /*
+ * If the new process will be in a different pid namespace
+ * don't allow the creation of threads.
+ */
+ if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
+ (task_active_pid_ns(current) != current->nsproxy->pid_ns))
+ return ERR_PTR(-EINVAL);
+
retval = security_task_create(clone_flags);
if (retval)
goto fork_out;
@@ -1613,7 +1621,6 @@ long do_fork(unsigned long clone_flags,
return nr;
}
-#ifdef CONFIG_GENERIC_KERNEL_THREAD
/*
* Create a kernel thread.
*/
@@ -1622,7 +1629,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
(unsigned long)arg, NULL, NULL);
}
-#endif
#ifdef __ARCH_WANT_SYS_FORK
SYSCALL_DEFINE0(fork)
@@ -1662,8 +1668,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int, tls_val)
#endif
{
- return do_fork(clone_flags, newsp, 0,
- parent_tidptr, child_tidptr);
+ long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
+ asmlinkage_protect(5, ret, clone_flags, newsp,
+ parent_tidptr, child_tidptr, tls_val);
+ return ret;
}
#endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 35c70c9e24d8..e49a288fa479 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -818,7 +818,7 @@ static void irq_thread_dtor(struct callback_head *unused)
action = kthread_data(tsk);
pr_err("exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
- tsk->comm ? tsk->comm : "", tsk->pid, action->irq);
+ tsk->comm, tsk->pid, action->irq);
desc = irq_to_desc(action->irq);
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 30b7b225306c..e30ac0fe61c3 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -4,6 +4,7 @@
#include <linux/string.h>
#include <linux/random.h>
#include <linux/module.h>
+#include <linux/ptrace.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/cache.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ecd42b484db8..56dd34976d7b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -228,9 +228,9 @@ static int ____call_usermodehelper(void *data)
commit_creds(new);
- retval = kernel_execve(sub_info->path,
- (const char *const *)sub_info->argv,
- (const char *const *)sub_info->envp);
+ retval = do_execve(sub_info->path,
+ (const char __user *const __user *)sub_info->argv,
+ (const char __user *const __user *)sub_info->envp);
if (!retval)
return 0;
diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S
new file mode 100644
index 000000000000..246b4c6e6135
--- /dev/null
+++ b/kernel/modsign_certificate.S
@@ -0,0 +1,19 @@
+/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */
+#ifndef SYMBOL_PREFIX
+#define ASM_SYMBOL(sym) sym
+#else
+#define PASTE2(x,y) x##y
+#define PASTE(x,y) PASTE2(x,y)
+#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym)
+#endif
+
+#define GLOBAL(name) \
+ .globl ASM_SYMBOL(name); \
+ ASM_SYMBOL(name):
+
+ .section ".init.data","aw"
+
+GLOBAL(modsign_certificate_list)
+ .incbin "signing_key.x509"
+ .incbin "extra_certificates"
+GLOBAL(modsign_certificate_list_end)
diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c
index 767e559dfb10..2b6e69909c39 100644
--- a/kernel/modsign_pubkey.c
+++ b/kernel/modsign_pubkey.c
@@ -20,12 +20,6 @@ struct key *modsign_keyring;
extern __initdata const u8 modsign_certificate_list[];
extern __initdata const u8 modsign_certificate_list_end[];
-asm(".section .init.data,\"aw\"\n"
- SYMBOL_PREFIX "modsign_certificate_list:\n"
- ".incbin \"signing_key.x509\"\n"
- ".incbin \"extra_certificates\"\n"
- SYMBOL_PREFIX "modsign_certificate_list_end:"
- );
/*
* We need to make sure ccache doesn't cache the .o file as it doesn't notice
@@ -40,18 +34,15 @@ static __init int module_verify_init(void)
{
pr_notice("Initialise module verification\n");
- modsign_keyring = key_alloc(&key_type_keyring, ".module_sign",
- KUIDT_INIT(0), KGIDT_INIT(0),
- current_cred(),
- (KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ,
- KEY_ALLOC_NOT_IN_QUOTA);
+ modsign_keyring = keyring_alloc(".module_sign",
+ KUIDT_INIT(0), KGIDT_INIT(0),
+ current_cred(),
+ ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
+ KEY_USR_VIEW | KEY_USR_READ),
+ KEY_ALLOC_NOT_IN_QUOTA, NULL);
if (IS_ERR(modsign_keyring))
panic("Can't allocate module signing keyring\n");
- if (key_instantiate_and_link(modsign_keyring, NULL, 0, NULL, NULL) < 0)
- panic("Can't instantiate module signing keyring\n");
-
return 0;
}
diff --git a/kernel/module.c b/kernel/module.c
index 808bd62e1723..eab08274ec9b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -21,6 +21,7 @@
#include <linux/ftrace_event.h>
#include <linux/init.h>
#include <linux/kallsyms.h>
+#include <linux/file.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kernel.h>
@@ -28,6 +29,7 @@
#include <linux/vmalloc.h>
#include <linux/elf.h>
#include <linux/proc_fs.h>
+#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
@@ -59,6 +61,7 @@
#include <linux/pfn.h>
#include <linux/bsearch.h>
#include <linux/fips.h>
+#include <uapi/linux/module.h>
#include "module-internal.h"
#define CREATE_TRACE_POINTS
@@ -185,6 +188,7 @@ struct load_info {
ongoing or failed initialization etc. */
static inline int strong_try_module_get(struct module *mod)
{
+ BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED);
if (mod && mod->state == MODULE_STATE_COMING)
return -EBUSY;
if (try_module_get(mod))
@@ -340,6 +344,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
#endif
};
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
+
if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
return true;
}
@@ -447,16 +454,24 @@ const struct kernel_symbol *find_symbol(const char *name,
EXPORT_SYMBOL_GPL(find_symbol);
/* Search for module by name: must hold module_mutex. */
-struct module *find_module(const char *name)
+static struct module *find_module_all(const char *name,
+ bool even_unformed)
{
struct module *mod;
list_for_each_entry(mod, &modules, list) {
+ if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (strcmp(mod->name, name) == 0)
return mod;
}
return NULL;
}
+
+struct module *find_module(const char *name)
+{
+ return find_module_all(name, false);
+}
EXPORT_SYMBOL_GPL(find_module);
#ifdef CONFIG_SMP
@@ -522,6 +537,8 @@ bool is_module_percpu_address(unsigned long addr)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (!mod->percpu_size)
continue;
for_each_possible_cpu(cpu) {
@@ -1045,6 +1062,8 @@ static ssize_t show_initstate(struct module_attribute *mattr,
case MODULE_STATE_GOING:
state = "going";
break;
+ default:
+ BUG();
}
return sprintf(buffer, "%s\n", state);
}
@@ -1783,6 +1802,8 @@ void set_all_modules_text_rw(void)
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((mod->module_core) && (mod->core_text_size)) {
set_page_attributes(mod->module_core,
mod->module_core + mod->core_text_size,
@@ -1804,6 +1825,8 @@ void set_all_modules_text_ro(void)
mutex_lock(&module_mutex);
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((mod->module_core) && (mod->core_text_size)) {
set_page_attributes(mod->module_core,
mod->module_core + mod->core_text_size,
@@ -2279,7 +2302,7 @@ static void layout_symtab(struct module *mod, struct load_info *info)
Elf_Shdr *symsect = info->sechdrs + info->index.sym;
Elf_Shdr *strsect = info->sechdrs + info->index.str;
const Elf_Sym *src;
- unsigned int i, nsrc, ndst, strtab_size;
+ unsigned int i, nsrc, ndst, strtab_size = 0;
/* Put symbol section at end of init part of module. */
symsect->sh_flags |= SHF_ALLOC;
@@ -2290,9 +2313,6 @@ static void layout_symtab(struct module *mod, struct load_info *info)
src = (void *)info->hdr + symsect->sh_offset;
nsrc = symsect->sh_size / sizeof(*src);
- /* strtab always starts with a nul, so offset 0 is the empty string. */
- strtab_size = 1;
-
/* Compute total space required for the core symbols' strtab. */
for (ndst = i = 0; i < nsrc; i++) {
if (i == 0 ||
@@ -2334,7 +2354,6 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
mod->core_symtab = dst = mod->module_core + info->symoffs;
mod->core_strtab = s = mod->module_core + info->stroffs;
src = mod->symtab;
- *s++ = 0;
for (ndst = i = 0; i < mod->num_symtab; i++) {
if (i == 0 ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) {
@@ -2375,7 +2394,7 @@ static void dynamic_debug_remove(struct _ddebug *debug)
void * __weak module_alloc(unsigned long size)
{
- return size == 0 ? NULL : vmalloc_exec(size);
+ return vmalloc_exec(size);
}
static void *module_alloc_update_bounds(unsigned long size)
@@ -2422,18 +2441,17 @@ static inline void kmemleak_load_module(const struct module *mod,
#endif
#ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info,
- const void *mod, unsigned long *_len)
+static int module_sig_check(struct load_info *info)
{
int err = -ENOKEY;
- unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
- unsigned long len = *_len;
+ const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
+ const void *mod = info->hdr;
- if (len > markerlen &&
- memcmp(mod + len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
+ if (info->len > markerlen &&
+ memcmp(mod + info->len - markerlen, MODULE_SIG_STRING, markerlen) == 0) {
/* We truncate the module to discard the signature */
- *_len -= markerlen;
- err = mod_verify_sig(mod, _len);
+ info->len -= markerlen;
+ err = mod_verify_sig(mod, &info->len);
}
if (!err) {
@@ -2451,59 +2469,114 @@ static int module_sig_check(struct load_info *info,
return err;
}
#else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info,
- void *mod, unsigned long *len)
+static int module_sig_check(struct load_info *info)
{
return 0;
}
#endif /* !CONFIG_MODULE_SIG */
-/* Sets info->hdr, info->len and info->sig_ok. */
-static int copy_and_check(struct load_info *info,
- const void __user *umod, unsigned long len,
- const char __user *uargs)
+/* Sanity checks against invalid binaries, wrong arch, weird elf version. */
+static int elf_header_check(struct load_info *info)
+{
+ if (info->len < sizeof(*(info->hdr)))
+ return -ENOEXEC;
+
+ if (memcmp(info->hdr->e_ident, ELFMAG, SELFMAG) != 0
+ || info->hdr->e_type != ET_REL
+ || !elf_check_arch(info->hdr)
+ || info->hdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (info->hdr->e_shoff >= info->len
+ || (info->hdr->e_shnum * sizeof(Elf_Shdr) >
+ info->len - info->hdr->e_shoff))
+ return -ENOEXEC;
+
+ return 0;
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_user(const void __user *umod, unsigned long len,
+ struct load_info *info)
{
int err;
- Elf_Ehdr *hdr;
- if (len < sizeof(*hdr))
+ info->len = len;
+ if (info->len < sizeof(*(info->hdr)))
return -ENOEXEC;
+ err = security_kernel_module_from_file(NULL);
+ if (err)
+ return err;
+
/* Suck in entire file: we'll want most of it. */
- if ((hdr = vmalloc(len)) == NULL)
+ info->hdr = vmalloc(info->len);
+ if (!info->hdr)
return -ENOMEM;
- if (copy_from_user(hdr, umod, len) != 0) {
- err = -EFAULT;
- goto free_hdr;
+ if (copy_from_user(info->hdr, umod, info->len) != 0) {
+ vfree(info->hdr);
+ return -EFAULT;
}
- err = module_sig_check(info, hdr, &len);
+ return 0;
+}
+
+/* Sets info->hdr and info->len. */
+static int copy_module_from_fd(int fd, struct load_info *info)
+{
+ struct file *file;
+ int err;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
+
+ file = fget(fd);
+ if (!file)
+ return -ENOEXEC;
+
+ err = security_kernel_module_from_file(file);
+ if (err)
+ goto out;
+
+ err = vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat);
if (err)
- goto free_hdr;
+ goto out;
- /* Sanity checks against insmoding binaries or wrong arch,
- weird elf version */
- if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
- || hdr->e_type != ET_REL
- || !elf_check_arch(hdr)
- || hdr->e_shentsize != sizeof(Elf_Shdr)) {
- err = -ENOEXEC;
- goto free_hdr;
+ if (stat.size > INT_MAX) {
+ err = -EFBIG;
+ goto out;
}
- if (hdr->e_shoff >= len ||
- hdr->e_shnum * sizeof(Elf_Shdr) > len - hdr->e_shoff) {
- err = -ENOEXEC;
- goto free_hdr;
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ err = -EINVAL;
+ goto out;
}
- info->hdr = hdr;
- info->len = len;
- return 0;
+ info->hdr = vmalloc(stat.size);
+ if (!info->hdr) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(info->hdr);
+ err = bytes;
+ goto out;
+ }
+ if (bytes == 0)
+ break;
+ pos += bytes;
+ }
+ info->len = pos;
-free_hdr:
- vfree(hdr);
+out:
+ fput(file);
return err;
}
@@ -2512,7 +2585,7 @@ static void free_copy(struct load_info *info)
vfree(info->hdr);
}
-static int rewrite_section_headers(struct load_info *info)
+static int rewrite_section_headers(struct load_info *info, int flags)
{
unsigned int i;
@@ -2540,7 +2613,10 @@ static int rewrite_section_headers(struct load_info *info)
}
/* Track but don't keep modinfo and version sections. */
- info->index.vers = find_sec(info, "__versions");
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
+ info->index.vers = 0; /* Pretend no __versions section! */
+ else
+ info->index.vers = find_sec(info, "__versions");
info->index.info = find_sec(info, ".modinfo");
info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -2555,7 +2631,7 @@ static int rewrite_section_headers(struct load_info *info)
* Return the temporary module pointer (we'll replace it with the final
* one when we move the module sections around).
*/
-static struct module *setup_load_info(struct load_info *info)
+static struct module *setup_load_info(struct load_info *info, int flags)
{
unsigned int i;
int err;
@@ -2566,7 +2642,7 @@ static struct module *setup_load_info(struct load_info *info)
info->secstrings = (void *)info->hdr
+ info->sechdrs[info->hdr->e_shstrndx].sh_offset;
- err = rewrite_section_headers(info);
+ err = rewrite_section_headers(info, flags);
if (err)
return ERR_PTR(err);
@@ -2604,11 +2680,14 @@ static struct module *setup_load_info(struct load_info *info)
return mod;
}
-static int check_modinfo(struct module *mod, struct load_info *info)
+static int check_modinfo(struct module *mod, struct load_info *info, int flags)
{
const char *modmagic = get_modinfo(info, "vermagic");
int err;
+ if (flags & MODULE_INIT_IGNORE_VERMAGIC)
+ modmagic = NULL;
+
/* This is allowed: modprobe --force will invalidate it. */
if (!modmagic) {
err = try_to_force_load(mod, "bad vermagic");
@@ -2738,20 +2817,23 @@ static int move_module(struct module *mod, struct load_info *info)
memset(ptr, 0, mod->core_size);
mod->module_core = ptr;
- ptr = module_alloc_update_bounds(mod->init_size);
- /*
- * The pointer to this block is stored in the module structure
- * which is inside the block. This block doesn't need to be
- * scanned as it contains data and code that will be freed
- * after the module is initialized.
- */
- kmemleak_ignore(ptr);
- if (!ptr && mod->init_size) {
- module_free(mod, mod->module_core);
- return -ENOMEM;
- }
- memset(ptr, 0, mod->init_size);
- mod->module_init = ptr;
+ if (mod->init_size) {
+ ptr = module_alloc_update_bounds(mod->init_size);
+ /*
+ * The pointer to this block is stored in the module structure
+ * which is inside the block. This block doesn't need to be
+ * scanned as it contains data and code that will be freed
+ * after the module is initialized.
+ */
+ kmemleak_ignore(ptr);
+ if (!ptr) {
+ module_free(mod, mod->module_core);
+ return -ENOMEM;
+ }
+ memset(ptr, 0, mod->init_size);
+ mod->module_init = ptr;
+ } else
+ mod->module_init = NULL;
/* Transfer each section which specifies SHF_ALLOC */
pr_debug("final section addresses:\n");
@@ -2844,18 +2926,18 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
return 0;
}
-static struct module *layout_and_allocate(struct load_info *info)
+static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
Elf_Shdr *pcpusec;
int err;
- mod = setup_load_info(info);
+ mod = setup_load_info(info, flags);
if (IS_ERR(mod))
return mod;
- err = check_modinfo(mod, info);
+ err = check_modinfo(mod, info, flags);
if (err)
return ERR_PTR(err);
@@ -2935,40 +3017,181 @@ static bool finished_loading(const char *name)
bool ret;
mutex_lock(&module_mutex);
- mod = find_module(name);
- ret = !mod || mod->state != MODULE_STATE_COMING;
+ mod = find_module_all(name, true);
+ ret = !mod || mod->state == MODULE_STATE_LIVE
+ || mod->state == MODULE_STATE_GOING;
mutex_unlock(&module_mutex);
return ret;
}
+/* Call module constructors. */
+static void do_mod_ctors(struct module *mod)
+{
+#ifdef CONFIG_CONSTRUCTORS
+ unsigned long i;
+
+ for (i = 0; i < mod->num_ctors; i++)
+ mod->ctors[i]();
+#endif
+}
+
+/* This is where the real work happens */
+static int do_init_module(struct module *mod)
+{
+ int ret = 0;
+
+ /*
+ * We want to find out whether @mod uses async during init. Clear
+ * PF_USED_ASYNC. async_schedule*() will set it.
+ */
+ current->flags &= ~PF_USED_ASYNC;
+
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_COMING, mod);
+
+ /* Set RO and NX regions for core */
+ set_section_ro_nx(mod->module_core,
+ mod->core_text_size,
+ mod->core_ro_size,
+ mod->core_size);
+
+ /* Set RO and NX regions for init */
+ set_section_ro_nx(mod->module_init,
+ mod->init_text_size,
+ mod->init_ro_size,
+ mod->init_size);
+
+ do_mod_ctors(mod);
+ /* Start the module */
+ if (mod->init != NULL)
+ ret = do_one_initcall(mod->init);
+ if (ret < 0) {
+ /* Init routine failed: abort. Try to protect us from
+ buggy refcounters. */
+ mod->state = MODULE_STATE_GOING;
+ synchronize_sched();
+ module_put(mod);
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_GOING, mod);
+ free_module(mod);
+ wake_up_all(&module_wq);
+ return ret;
+ }
+ if (ret > 0) {
+ printk(KERN_WARNING
+"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
+"%s: loading module anyway...\n",
+ __func__, mod->name, ret,
+ __func__);
+ dump_stack();
+ }
+
+ /* Now it's a first class citizen! */
+ mod->state = MODULE_STATE_LIVE;
+ blocking_notifier_call_chain(&module_notify_list,
+ MODULE_STATE_LIVE, mod);
+
+ /*
+ * We need to finish all async code before the module init sequence
+ * is done. This has potential to deadlock. For example, a newly
+ * detected block device can trigger request_module() of the
+ * default iosched from async probing task. Once userland helper
+ * reaches here, async_synchronize_full() will wait on the async
+ * task waiting on request_module() and deadlock.
+ *
+ * This deadlock is avoided by perfomring async_synchronize_full()
+ * iff module init queued any async jobs. This isn't a full
+ * solution as it will deadlock the same if module loading from
+ * async jobs nests more than once; however, due to the various
+ * constraints, this hack seems to be the best option for now.
+ * Please refer to the following thread for details.
+ *
+ * http://thread.gmane.org/gmane.linux.kernel/1420814
+ */
+ if (current->flags & PF_USED_ASYNC)
+ async_synchronize_full();
+
+ mutex_lock(&module_mutex);
+ /* Drop initial reference. */
+ module_put(mod);
+ trim_init_extable(mod);
+#ifdef CONFIG_KALLSYMS
+ mod->num_symtab = mod->core_num_syms;
+ mod->symtab = mod->core_symtab;
+ mod->strtab = mod->core_strtab;
+#endif
+ unset_module_init_ro_nx(mod);
+ module_free(mod, mod->module_init);
+ mod->module_init = NULL;
+ mod->init_size = 0;
+ mod->init_ro_size = 0;
+ mod->init_text_size = 0;
+ mutex_unlock(&module_mutex);
+ wake_up_all(&module_wq);
+
+ return 0;
+}
+
+static int may_init_module(void)
+{
+ if (!capable(CAP_SYS_MODULE) || modules_disabled)
+ return -EPERM;
+
+ return 0;
+}
+
/* Allocate and load the module: note that size of section 0 is always
zero, and we rely on this for optional sections. */
-static struct module *load_module(void __user *umod,
- unsigned long len,
- const char __user *uargs)
+static int load_module(struct load_info *info, const char __user *uargs,
+ int flags)
{
- struct load_info info = { NULL, };
struct module *mod, *old;
long err;
- pr_debug("load_module: umod=%p, len=%lu, uargs=%p\n",
- umod, len, uargs);
+ err = module_sig_check(info);
+ if (err)
+ goto free_copy;
- /* Copy in the blobs from userspace, check they are vaguely sane. */
- err = copy_and_check(&info, umod, len, uargs);
+ err = elf_header_check(info);
if (err)
- return ERR_PTR(err);
+ goto free_copy;
/* Figure out module layout, and allocate all the memory. */
- mod = layout_and_allocate(&info);
+ mod = layout_and_allocate(info, flags);
if (IS_ERR(mod)) {
err = PTR_ERR(mod);
goto free_copy;
}
+ /*
+ * We try to place it in the list now to make sure it's unique
+ * before we dedicate too many resources. In particular,
+ * temporary percpu memory exhaustion.
+ */
+ mod->state = MODULE_STATE_UNFORMED;
+again:
+ mutex_lock(&module_mutex);
+ if ((old = find_module_all(mod->name, true)) != NULL) {
+ if (old->state == MODULE_STATE_COMING
+ || old->state == MODULE_STATE_UNFORMED) {
+ /* Wait in case it fails to load. */
+ mutex_unlock(&module_mutex);
+ err = wait_event_interruptible(module_wq,
+ finished_loading(mod->name));
+ if (err)
+ goto free_module;
+ goto again;
+ }
+ err = -EEXIST;
+ mutex_unlock(&module_mutex);
+ goto free_module;
+ }
+ list_add_rcu(&mod->list, &modules);
+ mutex_unlock(&module_mutex);
+
#ifdef CONFIG_MODULE_SIG
- mod->sig_ok = info.sig_ok;
+ mod->sig_ok = info->sig_ok;
if (!mod->sig_ok)
add_taint_module(mod, TAINT_FORCED_MODULE);
#endif
@@ -2976,29 +3199,29 @@ static struct module *load_module(void __user *umod,
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
- goto free_module;
+ goto unlink_mod;
/* Now we've got everything in the final locations, we can
* find optional sections. */
- find_module_sections(mod, &info);
+ find_module_sections(mod, info);
err = check_module_license_and_versions(mod);
if (err)
goto free_unload;
/* Set up MODINFO_ATTR fields */
- setup_modinfo(mod, &info);
+ setup_modinfo(mod, info);
/* Fix up syms, so that st_value is a pointer to location. */
- err = simplify_symbols(mod, &info);
+ err = simplify_symbols(mod, info);
if (err < 0)
goto free_modinfo;
- err = apply_relocations(mod, &info);
+ err = apply_relocations(mod, info);
if (err < 0)
goto free_modinfo;
- err = post_relocation(mod, &info);
+ err = post_relocation(mod, info);
if (err < 0)
goto free_modinfo;
@@ -3011,72 +3234,49 @@ static struct module *load_module(void __user *umod,
goto free_arch_cleanup;
}
- /* Mark state as coming so strong_try_module_get() ignores us. */
- mod->state = MODULE_STATE_COMING;
+ dynamic_debug_setup(info->debug, info->num_debug);
- /* Now sew it into the lists so we can get lockdep and oops
- * info during argument parsing. No one should access us, since
- * strong_try_module_get() will fail.
- * lockdep/oops can run asynchronous, so use the RCU list insertion
- * function to insert in a way safe to concurrent readers.
- * The mutex protects against concurrent writers.
- */
-again:
mutex_lock(&module_mutex);
- if ((old = find_module(mod->name)) != NULL) {
- if (old->state == MODULE_STATE_COMING) {
- /* Wait in case it fails to load. */
- mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
- if (err)
- goto free_arch_cleanup;
- goto again;
- }
- err = -EEXIST;
- goto unlock;
- }
-
- /* This has to be done once we're sure module name is unique. */
- dynamic_debug_setup(info.debug, info.num_debug);
-
- /* Find duplicate symbols */
+ /* Find duplicate symbols (must be called under lock). */
err = verify_export_symbols(mod);
if (err < 0)
- goto ddebug;
+ goto ddebug_cleanup;
+
+ /* This relies on module_mutex for list integrity. */
+ module_bug_finalize(info->hdr, info->sechdrs, mod);
+
+ /* Mark state as coming so strong_try_module_get() ignores us,
+ * but kallsyms etc. can see us. */
+ mod->state = MODULE_STATE_COMING;
- module_bug_finalize(info.hdr, info.sechdrs, mod);
- list_add_rcu(&mod->list, &modules);
mutex_unlock(&module_mutex);
/* Module is ready to execute: parsing args may do that. */
err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
-32768, 32767, &ddebug_dyndbg_module_param_cb);
if (err < 0)
- goto unlink;
+ goto bug_cleanup;
/* Link in to syfs. */
- err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp);
+ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp);
if (err < 0)
- goto unlink;
+ goto bug_cleanup;
/* Get rid of temporary copy. */
- free_copy(&info);
+ free_copy(info);
/* Done! */
trace_module_load(mod);
- return mod;
- unlink:
+ return do_init_module(mod);
+
+ bug_cleanup:
+ /* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
- /* Unlink carefully: kallsyms could be walking list. */
- list_del_rcu(&mod->list);
module_bug_cleanup(mod);
- wake_up_all(&module_wq);
- ddebug:
- dynamic_debug_remove(info.debug);
- unlock:
+ ddebug_cleanup:
mutex_unlock(&module_mutex);
+ dynamic_debug_remove(info->debug);
synchronize_sched();
kfree(mod->args);
free_arch_cleanup:
@@ -3085,107 +3285,59 @@ again:
free_modinfo(mod);
free_unload:
module_unload_free(mod);
+ unlink_mod:
+ mutex_lock(&module_mutex);
+ /* Unlink carefully: kallsyms could be walking list. */
+ list_del_rcu(&mod->list);
+ wake_up_all(&module_wq);
+ mutex_unlock(&module_mutex);
free_module:
- module_deallocate(mod, &info);
+ module_deallocate(mod, info);
free_copy:
- free_copy(&info);
- return ERR_PTR(err);
-}
-
-/* Call module constructors. */
-static void do_mod_ctors(struct module *mod)
-{
-#ifdef CONFIG_CONSTRUCTORS
- unsigned long i;
-
- for (i = 0; i < mod->num_ctors; i++)
- mod->ctors[i]();
-#endif
+ free_copy(info);
+ return err;
}
-/* This is where the real work happens */
SYSCALL_DEFINE3(init_module, void __user *, umod,
unsigned long, len, const char __user *, uargs)
{
- struct module *mod;
- int ret = 0;
+ int err;
+ struct load_info info = { };
- /* Must have permission */
- if (!capable(CAP_SYS_MODULE) || modules_disabled)
- return -EPERM;
+ err = may_init_module();
+ if (err)
+ return err;
- /* Do all the hard work */
- mod = load_module(umod, len, uargs);
- if (IS_ERR(mod))
- return PTR_ERR(mod);
+ pr_debug("init_module: umod=%p, len=%lu, uargs=%p\n",
+ umod, len, uargs);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_COMING, mod);
+ err = copy_module_from_user(umod, len, &info);
+ if (err)
+ return err;
- /* Set RO and NX regions for core */
- set_section_ro_nx(mod->module_core,
- mod->core_text_size,
- mod->core_ro_size,
- mod->core_size);
+ return load_module(&info, uargs, 0);
+}
- /* Set RO and NX regions for init */
- set_section_ro_nx(mod->module_init,
- mod->init_text_size,
- mod->init_ro_size,
- mod->init_size);
+SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags)
+{
+ int err;
+ struct load_info info = { };
- do_mod_ctors(mod);
- /* Start the module */
- if (mod->init != NULL)
- ret = do_one_initcall(mod->init);
- if (ret < 0) {
- /* Init routine failed: abort. Try to protect us from
- buggy refcounters. */
- mod->state = MODULE_STATE_GOING;
- synchronize_sched();
- module_put(mod);
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_GOING, mod);
- free_module(mod);
- wake_up_all(&module_wq);
- return ret;
- }
- if (ret > 0) {
- printk(KERN_WARNING
-"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n"
-"%s: loading module anyway...\n",
- __func__, mod->name, ret,
- __func__);
- dump_stack();
- }
+ err = may_init_module();
+ if (err)
+ return err;
- /* Now it's a first class citizen! */
- mod->state = MODULE_STATE_LIVE;
- blocking_notifier_call_chain(&module_notify_list,
- MODULE_STATE_LIVE, mod);
+ pr_debug("finit_module: fd=%d, uargs=%p, flags=%i\n", fd, uargs, flags);
- /* We need to finish all async code before the module init sequence is done */
- async_synchronize_full();
+ if (flags & ~(MODULE_INIT_IGNORE_MODVERSIONS
+ |MODULE_INIT_IGNORE_VERMAGIC))
+ return -EINVAL;
- mutex_lock(&module_mutex);
- /* Drop initial reference. */
- module_put(mod);
- trim_init_extable(mod);
-#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
-#endif
- unset_module_init_ro_nx(mod);
- module_free(mod, mod->module_init);
- mod->module_init = NULL;
- mod->init_size = 0;
- mod->init_ro_size = 0;
- mod->init_text_size = 0;
- mutex_unlock(&module_mutex);
- wake_up_all(&module_wq);
+ err = copy_module_from_fd(fd, &info);
+ if (err)
+ return err;
- return 0;
+ return load_module(&info, uargs, flags);
}
static inline int within(unsigned long addr, void *start, unsigned long size)
@@ -3261,6 +3413,8 @@ const char *module_address_lookup(unsigned long addr,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
if (modname)
@@ -3284,6 +3438,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
const char *sym;
@@ -3308,6 +3464,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_init(addr, mod) ||
within_module_core(addr, mod)) {
const char *sym;
@@ -3335,6 +3493,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (symnum < mod->num_symtab) {
*value = mod->symtab[symnum].st_value;
*type = mod->symtab[symnum].st_info;
@@ -3377,9 +3537,12 @@ unsigned long module_kallsyms_lookup_name(const char *name)
ret = mod_find_symname(mod, colon+1);
*colon = ':';
} else {
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if ((ret = mod_find_symname(mod, name)) != 0)
break;
+ }
}
preempt_enable();
return ret;
@@ -3394,6 +3557,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
int ret;
list_for_each_entry(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
for (i = 0; i < mod->num_symtab; i++) {
ret = fn(data, mod->strtab + mod->symtab[i].st_name,
mod, mod->symtab[i].st_value);
@@ -3409,6 +3574,7 @@ static char *module_flags(struct module *mod, char *buf)
{
int bx = 0;
+ BUG_ON(mod->state == MODULE_STATE_UNFORMED);
if (mod->taints ||
mod->state == MODULE_STATE_GOING ||
mod->state == MODULE_STATE_COMING) {
@@ -3450,6 +3616,10 @@ static int m_show(struct seq_file *m, void *p)
struct module *mod = list_entry(p, struct module, list);
char buf[8];
+ /* We always ignore unformed modules. */
+ if (mod->state == MODULE_STATE_UNFORMED)
+ return 0;
+
seq_printf(m, "%s %u",
mod->name, mod->init_size + mod->core_size);
print_unload_info(m, mod);
@@ -3510,6 +3680,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (mod->num_exentries == 0)
continue;
@@ -3558,10 +3730,13 @@ struct module *__module_address(unsigned long addr)
if (addr < module_addr_min || addr > module_addr_max)
return NULL;
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
if (within_module_core(addr, mod)
|| within_module_init(addr, mod))
return mod;
+ }
return NULL;
}
EXPORT_SYMBOL_GPL(__module_address);
@@ -3614,8 +3789,11 @@ void print_modules(void)
printk(KERN_DEFAULT "Modules linked in:");
/* Most callers should already have preempt disabled, but make sure */
preempt_disable();
- list_for_each_entry_rcu(mod, &modules, list)
+ list_for_each_entry_rcu(mod, &modules, list) {
+ if (mod->state == MODULE_STATE_UNFORMED)
+ continue;
printk(" %s%s", mod->name, module_flags(mod, buf));
+ }
preempt_enable();
if (last_unloaded_module[0])
printk(" [last unloaded: %s]", last_unloaded_module);
diff --git a/kernel/pid.c b/kernel/pid.c
index 36aa02ff17d6..de9af600006f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -270,7 +270,6 @@ void free_pid(struct pid *pid)
wake_up_process(ns->child_reaper);
break;
case 0:
- ns->nr_hashed = -1;
schedule_work(&ns->proc_work);
break;
}
@@ -319,7 +318,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock);
- if (ns->nr_hashed < 0)
+ if (!(ns->nr_hashed & PIDNS_HASH_ADDING))
goto out_unlock;
for ( ; upid >= pid->numbers; --upid) {
hlist_add_head_rcu(&upid->pid_chain,
@@ -342,6 +341,13 @@ out_free:
goto out;
}
+void disable_pid_allocation(struct pid_namespace *ns)
+{
+ spin_lock_irq(&pidmap_lock);
+ ns->nr_hashed &= ~PIDNS_HASH_ADDING;
+ spin_unlock_irq(&pidmap_lock);
+}
+
struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
{
struct hlist_node *elem;
@@ -573,6 +579,9 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
+ /* Veryify no one has done anything silly */
+ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
+
/* bump default and minimum pid_max based on number of cpus */
pid_max = min(pid_max_max, max_t(int, pid_max,
PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
@@ -584,7 +593,7 @@ void __init pidmap_init(void)
/* Reserve PID 0. We never call free_pidmap(0) */
set_bit(0, init_pid_ns.pidmap[0].page);
atomic_dec(&init_pid_ns.pidmap[0].nr_free);
- init_pid_ns.nr_hashed = 1;
+ init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
init_pid_ns.pid_cachep = KMEM_CACHE(pid,
SLAB_HWCACHE_ALIGN | SLAB_PANIC);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 560da0dab230..c1c3dc1c6023 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -115,6 +115,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
+ ns->nr_hashed = PIDNS_HASH_ADDING;
INIT_WORK(&ns->proc_work, proc_cleanup_work);
set_bit(0, ns->pidmap[0].page);
@@ -181,6 +182,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int rc;
struct task_struct *task, *me = current;
+ /* Don't allow any more processes into the pid namespace */
+ disable_pid_allocation(pid_ns);
+
/* Ignore SIGCHLD causing any terminated children to autoreap */
spin_lock_irq(&me->sighand->siglock);
me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
@@ -325,7 +329,8 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns)
struct pid_namespace *active = task_active_pid_ns(current);
struct pid_namespace *ancestor, *new = ns;
- if (!ns_capable(new->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
+ !nsown_capable(CAP_SYS_ADMIN))
return -EPERM;
/*
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d73840271dce..a278cad1d5d6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -9,6 +9,7 @@
#include <asm/uaccess.h>
#include <linux/kernel_stat.h>
#include <trace/events/timer.h>
+#include <linux/random.h>
/*
* Called after updating RLIMIT_CPU to run cpu timer and update
@@ -470,6 +471,8 @@ static void cleanup_timers(struct list_head *head,
*/
void posix_cpu_timers_exit(struct task_struct *tsk)
{
+ add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+ sizeof(unsigned long long));
cleanup_timers(tsk->cpu_timers,
tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
diff --git a/kernel/printk.c b/kernel/printk.c
index 19c0d7bcf24a..357f714ddd49 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -870,10 +870,11 @@ static size_t print_time(u64 ts, char *buf)
if (!printk_time)
return 0;
+ rem_nsec = do_div(ts, 1000000000);
+
if (!buf)
- return 15;
+ return snprintf(NULL, 0, "[%5lu.000000] ", (unsigned long)ts);
- rem_nsec = do_div(ts, 1000000000);
return sprintf(buf, "[%5lu.%06lu] ",
(unsigned long)ts, rem_nsec / 1000);
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 1599157336a6..6cbeaae4406d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -117,11 +117,45 @@ void __ptrace_unlink(struct task_struct *child)
* TASK_KILLABLE sleeps.
*/
if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child))
- signal_wake_up(child, task_is_traced(child));
+ ptrace_signal_wake_up(child, true);
spin_unlock(&child->sighand->siglock);
}
+/* Ensure that nothing can wake it up, even SIGKILL */
+static bool ptrace_freeze_traced(struct task_struct *task)
+{
+ bool ret = false;
+
+ /* Lockless, nobody but us can set this flag */
+ if (task->jobctl & JOBCTL_LISTENING)
+ return ret;
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ task->state = __TASK_TRACED;
+ ret = true;
+ }
+ spin_unlock_irq(&task->sighand->siglock);
+
+ return ret;
+}
+
+static void ptrace_unfreeze_traced(struct task_struct *task)
+{
+ if (task->state != __TASK_TRACED)
+ return;
+
+ WARN_ON(!task->ptrace || task->parent != current);
+
+ spin_lock_irq(&task->sighand->siglock);
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ spin_unlock_irq(&task->sighand->siglock);
+}
+
/**
* ptrace_check_attach - check whether ptracee is ready for ptrace operation
* @child: ptracee to check for
@@ -139,7 +173,7 @@ void __ptrace_unlink(struct task_struct *child)
* RETURNS:
* 0 on success, -ESRCH if %child is not ready.
*/
-int ptrace_check_attach(struct task_struct *child, bool ignore_state)
+static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
{
int ret = -ESRCH;
@@ -151,24 +185,29 @@ int ptrace_check_attach(struct task_struct *child, bool ignore_state)
* be changed by us so it's not changing right after this.
*/
read_lock(&tasklist_lock);
- if ((child->ptrace & PT_PTRACED) && child->parent == current) {
+ if (child->ptrace && child->parent == current) {
+ WARN_ON(child->state == __TASK_TRACED);
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
*/
- spin_lock_irq(&child->sighand->siglock);
- WARN_ON_ONCE(task_is_stopped(child));
- if (ignore_state || (task_is_traced(child) &&
- !(child->jobctl & JOBCTL_LISTENING)))
+ if (ignore_state || ptrace_freeze_traced(child))
ret = 0;
- spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
- if (!ret && !ignore_state)
- ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
+ if (!ret && !ignore_state) {
+ if (!wait_task_inactive(child, __TASK_TRACED)) {
+ /*
+ * This can only happen if may_ptrace_stop() fails and
+ * ptrace_stop() changes ->state back to TASK_RUNNING,
+ * so we should not worry about leaking __TASK_TRACED.
+ */
+ WARN_ON(child->state == __TASK_TRACED);
+ ret = -ESRCH;
+ }
+ }
- /* All systems go.. */
return ret;
}
@@ -317,7 +356,7 @@ static int ptrace_attach(struct task_struct *task, long request,
*/
if (task_is_stopped(task) &&
task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING))
- signal_wake_up(task, 1);
+ signal_wake_up_state(task, __TASK_STOPPED);
spin_unlock(&task->sighand->siglock);
@@ -737,7 +776,7 @@ int ptrace_request(struct task_struct *child, long request,
* tracee into STOP.
*/
if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP)))
- signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
+ ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING);
unlock_task_sighand(child, &flags);
ret = 0;
@@ -763,7 +802,7 @@ int ptrace_request(struct task_struct *child, long request,
* start of this trap and now. Trigger re-trap.
*/
if (child->jobctl & JOBCTL_TRAP_NOTIFY)
- signal_wake_up(child, true);
+ ptrace_signal_wake_up(child, true);
ret = 0;
}
unlock_task_sighand(child, &flags);
@@ -900,6 +939,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
goto out_put_task_struct;
ret = arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
out_put_task_struct:
put_task_struct(child);
@@ -1039,8 +1080,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
ret = ptrace_check_attach(child, request == PTRACE_KILL ||
request == PTRACE_INTERRUPT);
- if (!ret)
+ if (!ret) {
ret = compat_arch_ptrace(child, request, addr, data);
+ if (ret || request != PTRACE_DETACH)
+ ptrace_unfreeze_traced(child);
+ }
out_put_task_struct:
put_task_struct(child);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 3920d593e63c..ff55247e7049 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -86,33 +86,39 @@ int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
return __res_counter_charge(counter, val, limit_fail_at, true);
}
-void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
+u64 res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
{
if (WARN_ON(counter->usage < val))
val = counter->usage;
counter->usage -= val;
+ return counter->usage;
}
-void res_counter_uncharge_until(struct res_counter *counter,
- struct res_counter *top,
- unsigned long val)
+u64 res_counter_uncharge_until(struct res_counter *counter,
+ struct res_counter *top,
+ unsigned long val)
{
unsigned long flags;
struct res_counter *c;
+ u64 ret = 0;
local_irq_save(flags);
for (c = counter; c != top; c = c->parent) {
+ u64 r;
spin_lock(&c->lock);
- res_counter_uncharge_locked(c, val);
+ r = res_counter_uncharge_locked(c, val);
+ if (c == counter)
+ ret = r;
spin_unlock(&c->lock);
}
local_irq_restore(flags);
+ return ret;
}
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+u64 res_counter_uncharge(struct res_counter *counter, unsigned long val)
{
- res_counter_uncharge_until(counter, NULL, val);
+ return res_counter_uncharge_until(counter, NULL, val);
}
static inline unsigned long long *
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 6850f53e02d8..b3c6c3fcd847 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int subclass)
EXPORT_SYMBOL(down_read_nested);
+void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+{
+ might_sleep();
+ rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
+
+ LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+}
+
+EXPORT_SYMBOL(_down_write_nest_lock);
+
void down_write_nested(struct rw_semaphore *sem, int subclass)
{
might_sleep();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6737f4fb63b..bfe8ae22f710 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1523,7 +1523,8 @@ out:
*/
int wake_up_process(struct task_struct *p)
{
- return try_to_wake_up(p, TASK_ALL, 0);
+ WARN_ON(task_is_stopped_or_traced(p));
+ return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4603d6cb9e25..5eea8707234a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -793,8 +793,11 @@ unsigned int sysctl_numa_balancing_scan_delay = 1000;
static void task_numa_placement(struct task_struct *p)
{
- int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+ int seq;
+ if (!p->mm) /* for example, ksmd faulting in a user's mm */
+ return;
+ seq = ACCESS_ONCE(p->mm->numa_scan_seq);
if (p->numa_scan_seq == seq)
return;
p->numa_scan_seq = seq;
diff --git a/kernel/signal.c b/kernel/signal.c
index 580a91e63471..3d09cf6cde75 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -31,6 +31,7 @@
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
#include <linux/uprobes.h>
+#include <linux/compat.h>
#define CREATE_TRACE_POINTS
#include <trace/events/signal.h>
@@ -679,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* No need to set need_resched since signal event passing
* goes through ->blocked
*/
-void signal_wake_up(struct task_struct *t, int resume)
+void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
- unsigned int mask;
-
set_tsk_thread_flag(t, TIF_SIGPENDING);
-
/*
- * For SIGKILL, we want to wake it up in the stopped/traced/killable
+ * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
* case. We don't check t->state here because there is a race with it
* executing another processor and just now entering stopped state.
* By using wake_up_state, we ensure the process will wake up and
* handle its death signal.
*/
- mask = TASK_INTERRUPTIBLE;
- if (resume)
- mask |= TASK_WAKEKILL;
- if (!wake_up_state(t, mask))
+ if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
kick_process(t);
}
@@ -843,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t)
assert_spin_locked(&t->sighand->siglock);
task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY);
- signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
+ ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING);
}
/*
@@ -1799,6 +1794,10 @@ static inline int may_ptrace_stop(void)
* If SIGKILL was already sent before the caller unlocked
* ->siglock we must see ->core_state != NULL. Otherwise it
* is safe to enter schedule().
+ *
+ * This is almost outdated, a task with the pending SIGKILL can't
+ * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported
+ * after SIGKILL was already dequeued.
*/
if (unlikely(current->mm->core_state) &&
unlikely(current->mm == current->parent->mm))
@@ -1924,6 +1923,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
if (gstop_done)
do_notify_parent_cldstop(current, false, why);
+ /* tasklist protects us from ptrace_freeze_traced() */
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
@@ -2527,11 +2527,8 @@ static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
*/
void set_current_blocked(sigset_t *newset)
{
- struct task_struct *tsk = current;
sigdelsetmask(newset, sigmask(SIGKILL) | sigmask(SIGSTOP));
- spin_lock_irq(&tsk->sighand->siglock);
- __set_task_blocked(tsk, newset);
- spin_unlock_irq(&tsk->sighand->siglock);
+ __set_current_blocked(newset);
}
void __set_current_blocked(const sigset_t *newset)
@@ -3094,6 +3091,80 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
out:
return error;
}
+#ifdef CONFIG_GENERIC_SIGALTSTACK
+SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
+{
+ return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+}
+#endif
+
+int restore_altstack(const stack_t __user *uss)
+{
+ int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+ /* squash all but EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __save_altstack(stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user((void __user *)t->sas_ss_sp, &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+
+#ifdef CONFIG_COMPAT
+#ifdef CONFIG_GENERIC_SIGALTSTACK
+COMPAT_SYSCALL_DEFINE2(sigaltstack,
+ const compat_stack_t __user *, uss_ptr,
+ compat_stack_t __user *, uoss_ptr)
+{
+ stack_t uss, uoss;
+ int ret;
+ mm_segment_t seg;
+
+ if (uss_ptr) {
+ compat_stack_t uss32;
+
+ memset(&uss, 0, sizeof(stack_t));
+ if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
+ return -EFAULT;
+ uss.ss_sp = compat_ptr(uss32.ss_sp);
+ uss.ss_flags = uss32.ss_flags;
+ uss.ss_size = uss32.ss_size;
+ }
+ seg = get_fs();
+ set_fs(KERNEL_DS);
+ ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
+ (stack_t __force __user *) &uoss,
+ compat_user_stack_pointer());
+ set_fs(seg);
+ if (ret >= 0 && uoss_ptr) {
+ if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
+ __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
+ __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
+ __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ ret = -EFAULT;
+ }
+ return ret;
+}
+
+int compat_restore_altstack(const compat_stack_t __user *uss)
+{
+ int err = compat_sys_sigaltstack(uss, NULL);
+ /* squash all but -EFAULT for now */
+ return err == -EFAULT ? err : 0;
+}
+
+int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
+{
+ struct task_struct *t = current;
+ return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
+ __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ __put_user(t->sas_ss_size, &uss->ss_size);
+}
+#endif
+#endif
#ifdef __ARCH_WANT_SYS_SIGPENDING
@@ -3130,7 +3201,6 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
if (nset) {
if (copy_from_user(&new_set, nset, sizeof(*nset)))
return -EFAULT;
- new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
new_blocked = current->blocked;
@@ -3148,7 +3218,7 @@ SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
return -EINVAL;
}
- __set_current_blocked(&new_blocked);
+ set_current_blocked(&new_blocked);
}
if (oset) {
@@ -3212,6 +3282,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask)
int old = current->blocked.sig[0];
sigset_t newset;
+ siginitset(&newset, newmask);
set_current_blocked(&newset);
return old;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index dbff751e4086..395084d4ce16 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
cond_syscall(sys_init_module);
+cond_syscall(sys_finit_module);
cond_syscall(sys_delete_module);
cond_syscall(sys_socketpair);
cond_syscall(sys_bind);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 3ffe4c5ad3f3..41473b4ad7a4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3998,7 +3998,7 @@ static int ftrace_module_notify(struct notifier_block *self,
struct notifier_block ftrace_module_nb = {
.notifier_call = ftrace_module_notify,
- .priority = 0,
+ .priority = INT_MAX, /* Run before anything that can use kprobes */
};
extern unsigned long __start_mcount_loc[];
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 61e081b4ba11..3c13e46d7d24 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2899,6 +2899,8 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
+ buf[cnt] = 0;
+
trace_set_options(buf);
*ppos += cnt;
@@ -3034,6 +3036,31 @@ static void set_buffer_entries(struct trace_array *tr, unsigned long val)
tr->data[cpu]->entries = val;
}
+/* resize @tr's buffer to the size of @size_tr's entries */
+static int resize_buffer_duplicate_size(struct trace_array *tr,
+ struct trace_array *size_tr, int cpu_id)
+{
+ int cpu, ret = 0;
+
+ if (cpu_id == RING_BUFFER_ALL_CPUS) {
+ for_each_tracing_cpu(cpu) {
+ ret = ring_buffer_resize(tr->buffer,
+ size_tr->data[cpu]->entries, cpu);
+ if (ret < 0)
+ break;
+ tr->data[cpu]->entries = size_tr->data[cpu]->entries;
+ }
+ } else {
+ ret = ring_buffer_resize(tr->buffer,
+ size_tr->data[cpu_id]->entries, cpu_id);
+ if (ret == 0)
+ tr->data[cpu_id]->entries =
+ size_tr->data[cpu_id]->entries;
+ }
+
+ return ret;
+}
+
static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
{
int ret;
@@ -3058,23 +3085,8 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
ret = ring_buffer_resize(max_tr.buffer, size, cpu);
if (ret < 0) {
- int r = 0;
-
- if (cpu == RING_BUFFER_ALL_CPUS) {
- int i;
- for_each_tracing_cpu(i) {
- r = ring_buffer_resize(global_trace.buffer,
- global_trace.data[i]->entries,
- i);
- if (r < 0)
- break;
- }
- } else {
- r = ring_buffer_resize(global_trace.buffer,
- global_trace.data[cpu]->entries,
- cpu);
- }
-
+ int r = resize_buffer_duplicate_size(&global_trace,
+ &global_trace, cpu);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -3212,17 +3224,11 @@ static int tracing_set_tracer(const char *buf)
topts = create_trace_option_files(t);
if (t->use_max_tr) {
- int cpu;
/* we need to make per cpu buffer sizes equivalent */
- for_each_tracing_cpu(cpu) {
- ret = ring_buffer_resize(max_tr.buffer,
- global_trace.data[cpu]->entries,
- cpu);
- if (ret < 0)
- goto out;
- max_tr.data[cpu]->entries =
- global_trace.data[cpu]->entries;
- }
+ ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
+ RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ goto out;
}
if (t->init) {
@@ -3448,7 +3454,7 @@ static int tracing_wait_pipe(struct file *filp)
return -EINTR;
/*
- * We block until we read something and tracing is enabled.
+ * We block until we read something and tracing is disabled.
* We still block if tracing is disabled, but we have never
* read anything. This allows a user to cat this file, and
* then enable tracing. But after we have read something,
@@ -3456,7 +3462,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (tracing_is_enabled() && iter->pos)
+ if (!tracing_is_enabled() && iter->pos)
break;
}
@@ -4271,13 +4277,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -ENOMEM;
if (*ppos & (PAGE_SIZE - 1)) {
- WARN_ONCE(1, "Ftrace: previous read must page-align\n");
ret = -EINVAL;
goto out;
}
if (len & (PAGE_SIZE - 1)) {
- WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
if (len < PAGE_SIZE) {
ret = -EINVAL;
goto out;
@@ -4813,10 +4817,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
return ret;
if (buffer) {
- if (val)
+ mutex_lock(&trace_types_lock);
+ if (val) {
ring_buffer_record_on(buffer);
- else
+ if (current_trace->start)
+ current_trace->start(tr);
+ } else {
ring_buffer_record_off(buffer);
+ if (current_trace->stop)
+ current_trace->stop(tr);
+ }
+ mutex_unlock(&trace_types_lock);
}
(*ppos)++;
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 0c1b165778e5..42ca822fc701 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -33,7 +33,6 @@ static unsigned long max_stack_size;
static arch_spinlock_t max_stack_lock =
(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
-static int stack_trace_disabled __read_mostly;
static DEFINE_PER_CPU(int, trace_active);
static DEFINE_MUTEX(stack_sysctl_mutex);
@@ -116,9 +115,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
{
int cpu;
- if (unlikely(!ftrace_enabled || stack_trace_disabled))
- return;
-
preempt_disable_notrace();
cpu = raw_smp_processor_id();
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f5975ccf9348..2b042c42fbc4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -799,7 +799,7 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
if (user_ns == current_user_ns())
return -EINVAL;
- /* Threaded many not enter a different user namespace */
+ /* Threaded processes may not enter a different user namespace */
if (atomic_read(&current->mm->mm_users) > 1)
return -EINVAL;
diff --git a/kernel/utsname.c b/kernel/utsname.c
index f6336d51d64c..08b197e8c485 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -113,7 +113,8 @@ static int utsns_install(struct nsproxy *nsproxy, void *new)
{
struct uts_namespace *ns = new;
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !nsown_capable(CAP_SYS_ADMIN))
return -EPERM;
get_uts_ns(ns);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 997c6a16ec22..75a2ab3d0b02 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -344,6 +344,10 @@ static void watchdog_enable(unsigned int cpu)
{
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+ /* kick off the timer for the hardlockup detector */
+ hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hrtimer->function = watchdog_timer_fn;
+
if (!watchdog_enabled) {
kthread_park(current);
return;
@@ -352,10 +356,6 @@ static void watchdog_enable(unsigned int cpu)
/* Enable the perf event */
watchdog_nmi_enable(cpu);
- /* kick off the timer for the hardlockup detector */
- hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hrtimer->function = watchdog_timer_fn;
-
/* done here because hrtimer_start can only pin to smp_processor_id() */
hrtimer_start(hrtimer, ns_to_ktime(sample_period),
HRTIMER_MODE_REL_PINNED);
@@ -369,9 +369,6 @@ static void watchdog_disable(unsigned int cpu)
{
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- if (!watchdog_enabled)
- return;
-
watchdog_set_prio(SCHED_NORMAL, 0);
hrtimer_cancel(hrtimer);
/* disable the perf event */