From 991f76f837bf22c5bb07261cfd86525a0a96650c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:24 +0800 Subject: sysfs: fix race between readdir and lseek While readdir() is running, lseek() may set filp->f_pos as zero, then may leave filp->private_data pointing to one sysfs_dirent object without holding its reference counter, so the sysfs_dirent object may be used after free in next readdir(). This patch holds inode->i_mutex to avoid the problem since the lock is always held in readdir path. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/sysfs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25c..c9e16608f486 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1058,10 +1058,21 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; } +static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file_inode(file); + loff_t ret; + + mutex_lock(&inode->i_mutex); + ret = generic_file_llseek(file, offset, whence); + mutex_unlock(&inode->i_mutex); + + return ret; +} const struct file_operations sysfs_dir_operations = { .read = generic_read_dir, .readdir = sysfs_readdir, .release = sysfs_dir_release, - .llseek = generic_file_llseek, + .llseek = sysfs_dir_llseek, }; -- cgit v1.2.3 From e5110f411d2ee35bf8d202ccca2e89c633060dca Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 20 Mar 2013 23:25:25 +0800 Subject: sysfs: handle failure path correctly for readdir() In case of 'if (filp->f_pos == 0 or 1)' of sysfs_readdir(), the failure from filldir() isn't handled, and the reference counter of the sysfs_dirent object pointed by filp->private_data will be released without clearing filp->private_data, so use after free bug will be triggered later. This patch returns immeadiately under the situation for fixing the bug, and it is reasonable to return from readdir() when filldir() fails. Reported-by: Dave Jones Tested-by: Sasha Levin Cc: Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/sysfs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c9e16608f486..e14512678c9b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -1020,6 +1020,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } if (filp->f_pos == 1) { if (parent_sd->s_parent) @@ -1028,6 +1030,8 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) ino = parent_sd->s_ino; if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) == 0) filp->f_pos++; + else + return 0; } mutex_lock(&sysfs_mutex); for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); -- cgit v1.2.3 From 3db3c62584fbafee52a068035cc4c57e7b921acf Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Fri, 8 Mar 2013 16:07:27 +0100 Subject: sysfs: use atomic_inc_unless_negative in sysfs_get_active It seems that sysfs has an interesting way of doing the same thing. This removes the cpu_relax unfortunately, but if it's really needed, it would be better to add this to include/linux/atomic.h to benefit all atomic ops users. Signed-off-by: Maarten Lankhorst Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'fs/sysfs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2fbdff6be25c..7f968ede20d6 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -165,21 +165,8 @@ struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return NULL; - while (1) { - int v, t; - - v = atomic_read(&sd->s_active); - if (unlikely(v < 0)) - return NULL; - - t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) - break; - if (t < 0) - return NULL; - - cpu_relax(); - } + if (!atomic_inc_unless_negative(&sd->s_active)) + return NULL; if (likely(!ignore_lockdep(sd))) rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); -- cgit v1.2.3 From 87a8ebd637dafc255070f503909a053cf0d98d3f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sun, 24 Mar 2013 14:28:27 -0700 Subject: userns: Restrict when proc and sysfs can be mounted Only allow unprivileged mounts of proc and sysfs if they are already mounted when the user namespace is created. proc and sysfs are interesting because they have content that is per namespace, and so fresh mounts are needed when new namespaces are created while at the same time proc and sysfs have content that is shared between every instance. Respect the policy of who may see the shared content of proc and sysfs by only allowing new mounts if there was an existing mount at the time the user namespace was created. In practice there are only two interesting cases: proc and sysfs are mounted at their usual places, proc and sysfs are not mounted at all (some form of mount namespace jail). Cc: stable@vger.kernel.org Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 21 +++++++++++++++++++++ fs/proc/root.c | 4 ++++ fs/sysfs/mount.c | 4 ++++ include/linux/user_namespace.h | 4 ++++ kernel/user.c | 2 ++ kernel/user_namespace.c | 2 ++ 6 files changed, 37 insertions(+) (limited to 'fs/sysfs') diff --git a/fs/namespace.c b/fs/namespace.c index 968d4c5eae03..d581e45c0a9f 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2763,6 +2763,27 @@ bool current_chrooted(void) return chrooted; } +void update_mnt_policy(struct user_namespace *userns) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + switch (mnt->mnt.mnt_sb->s_magic) { + case SYSFS_MAGIC: + userns->may_mount_sysfs = true; + break; + case PROC_SUPER_MAGIC: + userns->may_mount_proc = true; + break; + } + if (userns->may_mount_sysfs && userns->may_mount_proc) + break; + } + up_read(&namespace_sem); +} + static void *mntns_get(struct task_struct *task) { struct mnt_namespace *ns = NULL; diff --git a/fs/proc/root.c b/fs/proc/root.c index c6e9fac26bac..9c7fab1d23f0 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -108,6 +109,9 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, } else { ns = task_active_pid_ns(current); options = data; + + if (!current_user_ns()->may_mount_proc) + return ERR_PTR(-EPERM); } sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns); diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8d924b5ec733..afd83273e6ce 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "sysfs.h" @@ -111,6 +112,9 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; + if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + return ERR_PTR(-EPERM); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return ERR_PTR(-ENOMEM); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4ce009324933..b6b215f13b45 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,6 +26,8 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + bool may_mount_sysfs; + bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -82,4 +84,6 @@ static inline void put_user_ns(struct user_namespace *ns) #endif +void update_mnt_policy(struct user_namespace *userns); + #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03b..8e635a18ab52 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 0f1e42884577..a54f26f82eb2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -96,6 +96,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } -- cgit v1.2.3 From f7db5e7660b122142410dcf36ba903c73d473250 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 2 Apr 2013 10:12:26 +0800 Subject: sysfs: fix use after free in case of concurrent read/write and readdir The inode->i_mutex isn't hold when updating filp->f_pos in read()/write(), so the filp->f_pos might be read as 0 or 1 in readdir() when there is concurrent read()/write() on this same file, then may cause use after free in readdir(). The bug can be reproduced with Li Zefan's test code on the link: https://patchwork.kernel.org/patch/2160771/ This patch fixes the use after free under this situation. Cc: stable Reported-by: Li Zefan Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs/sysfs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index c6f54abe9852..1bf016b5e88f 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -999,6 +999,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) enum kobj_ns_type type; const void *ns; ino_t ino; + loff_t off; type = sysfs_ns_type(parent_sd); ns = sysfs_info(dentry->d_sb)->ns[type]; @@ -1021,6 +1022,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; } mutex_lock(&sysfs_mutex); + off = filp->f_pos; for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos); pos; pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) { @@ -1032,19 +1034,24 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) len = strlen(name); ino = pos->s_ino; type = dt_type(pos); - filp->f_pos = pos->s_hash; + off = filp->f_pos = pos->s_hash; filp->private_data = sysfs_get(pos); mutex_unlock(&sysfs_mutex); - ret = filldir(dirent, name, len, filp->f_pos, ino, type); + ret = filldir(dirent, name, len, off, ino, type); mutex_lock(&sysfs_mutex); if (ret < 0) break; } mutex_unlock(&sysfs_mutex); - if ((filp->f_pos > 1) && !pos) { /* EOF */ - filp->f_pos = INT_MAX; + + /* don't reference last entry if its refcount is dropped */ + if (!pos) { filp->private_data = NULL; + + /* EOF and not changed as 0 or 1 in read/write path */ + if (off == filp->f_pos && off > 1) + filp->f_pos = INT_MAX; } return 0; } -- cgit v1.2.3 From bb2b0051d7b0772ea9d0b4be900c2d965093f5d7 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 4 Apr 2013 22:22:37 +0800 Subject: sysfs: check if one entry has been removed before freeing It might be a kernel disaster if one sysfs entry is freed but still referenced by sysfs tree. Recently Dave and Sasha reported one use-after-free problem on sysfs entry, and the problem has been troubleshooted with help of debug message added in this patch. Given sysfs_get_dirent/sysfs_put are exported APIs, even inside sysfs they are called in many contexts(kobject/attribe add/delete, inode init/drop, dentry lookup/release, readdir, ...), it is healthful to check the removed flag before freeing one entry and dump message if it is freeing without being removed first. Cc: Dave Jones Cc: Sasha Levin Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs/sysfs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 1bf016b5e88f..e8e0e71b29d5 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -268,6 +268,10 @@ void release_sysfs_dirent(struct sysfs_dirent * sd) */ parent_sd = sd->s_parent; + WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED), + "sysfs: free using entry: %s/%s\n", + parent_sd ? parent_sd->s_name : "", sd->s_name); + if (sysfs_type(sd) == SYSFS_KOBJ_LINK) sysfs_put(sd->s_symlink.target_sd); if (sysfs_type(sd) & SYSFS_COPY_NAME) @@ -386,7 +390,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) sd->s_name = name; sd->s_mode = mode; - sd->s_flags = type; + sd->s_flags = type | SYSFS_FLAG_REMOVED; return sd; @@ -466,6 +470,9 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd) ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME; } + /* Mark the entry added into directory tree */ + sd->s_flags &= ~SYSFS_FLAG_REMOVED; + return 0; } -- cgit v1.2.3