From 002345925e6c45861f60db6f4fc6236713fd8847 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:36:43 -0800 Subject: syslog: distinguish between /proc/kmsg and syscalls This allows the LSM to distinguish between syslog functions originating from /proc/kmsg access and direct syscalls. By default, the commoncaps will now no longer require CAP_SYS_ADMIN to read an opened /proc/kmsg file descriptor. For example the kernel syslog reader can now drop privileges after opening /proc/kmsg, instead of staying privileged with CAP_SYS_ADMIN. MAC systems that implement security_syslog have unchanged behavior. Signed-off-by: Kees Cook Acked-by: Serge Hallyn Acked-by: John Johansen Signed-off-by: James Morris --- fs/proc/kmsg.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 7ca78346d3f0..6a3d843a1088 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -12,37 +12,37 @@ #include #include #include +#include #include #include extern wait_queue_head_t log_wait; -extern int do_syslog(int type, char __user *bug, int count); - static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1,NULL,0); + return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0,NULL,0); + (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); return 0; } static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - if ((file->f_flags & O_NONBLOCK) && !do_syslog(9, NULL, 0)) + if ((file->f_flags & O_NONBLOCK) && + !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count); + return do_syslog(2, buf, count, SYSLOG_FROM_FILE); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0)) + if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } -- cgit v1.2.3 From d78ca3cd733d8a2c3dcd88471beb1a15d973eed8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:37:13 -0800 Subject: syslog: use defined constants instead of raw numbers Right now the syslog "type" action are just raw numbers which makes the source difficult to follow. This patch replaces the raw numbers with defined constants for some level of sanity. Signed-off-by: Kees Cook Acked-by: John Johansen Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/proc/kmsg.c | 10 +++++----- include/linux/syslog.h | 23 +++++++++++++++++++++++ kernel/printk.c | 45 +++++++++++++++++++-------------------------- security/commoncap.c | 5 +++-- security/selinux/hooks.c | 21 +++++++++++---------- 5 files changed, 61 insertions(+), 43 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 6a3d843a1088..cfe90a48a6e8 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(1, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(0, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return -EAGAIN; - return do_syslog(2, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(9, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) return POLLIN | POLLRDNORM; return 0; } diff --git a/include/linux/syslog.h b/include/linux/syslog.h index 5f02b1817be1..38911391a139 100644 --- a/include/linux/syslog.h +++ b/include/linux/syslog.h @@ -21,6 +21,29 @@ #ifndef _LINUX_SYSLOG_H #define _LINUX_SYSLOG_H +/* Close the log. Currently a NOP. */ +#define SYSLOG_ACTION_CLOSE 0 +/* Open the log. Currently a NOP. */ +#define SYSLOG_ACTION_OPEN 1 +/* Read from the log. */ +#define SYSLOG_ACTION_READ 2 +/* Read all messages remaining in the ring buffer. */ +#define SYSLOG_ACTION_READ_ALL 3 +/* Read and clear all messages remaining in the ring buffer */ +#define SYSLOG_ACTION_READ_CLEAR 4 +/* Clear ring buffer. */ +#define SYSLOG_ACTION_CLEAR 5 +/* Disable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_OFF 6 +/* Enable printk's to console */ +#define SYSLOG_ACTION_CONSOLE_ON 7 +/* Set level of messages printed to console */ +#define SYSLOG_ACTION_CONSOLE_LEVEL 8 +/* Return number of unread characters in the log buffer */ +#define SYSLOG_ACTION_SIZE_UNREAD 9 +/* Return size of the log buffer */ +#define SYSLOG_ACTION_SIZE_BUFFER 10 + #define SYSLOG_FROM_CALL 0 #define SYSLOG_FROM_FILE 1 diff --git a/kernel/printk.c b/kernel/printk.c index 809cf9a258a0..3e162d867098 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -259,21 +259,6 @@ static inline void boot_delay_msec(void) } #endif -/* - * Commands to do_syslog: - * - * 0 -- Close the log. Currently a NOP. - * 1 -- Open the log. Currently a NOP. - * 2 -- Read from the log. - * 3 -- Read all messages remaining in the ring buffer. - * 4 -- Read and clear all messages remaining in the ring buffer - * 5 -- Clear ring buffer. - * 6 -- Disable printk's to console - * 7 -- Enable printk's to console - * 8 -- Set level of messages printed to console - * 9 -- Return number of unread characters in the log buffer - * 10 -- Return size of the log buffer - */ int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; @@ -286,11 +271,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) return error; switch (type) { - case 0: /* Close log */ + case SYSLOG_ACTION_CLOSE: /* Close log */ break; - case 1: /* Open log */ + case SYSLOG_ACTION_OPEN: /* Open log */ break; - case 2: /* Read from log */ + case SYSLOG_ACTION_READ: /* Read from log */ error = -EINVAL; if (!buf || len < 0) goto out; @@ -321,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (!error) error = i; break; - case 4: /* Read/clear last kernel messages */ + /* Read/clear last kernel messages */ + case SYSLOG_ACTION_READ_CLEAR: do_clear = 1; /* FALL THRU */ - case 3: /* Read last kernel messages */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: error = -EINVAL; if (!buf || len < 0) goto out; @@ -377,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } } break; - case 5: /* Clear ring buffer */ + /* Clear ring buffer */ + case SYSLOG_ACTION_CLEAR: logged_chars = 0; break; - case 6: /* Disable logging to console */ + /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; - case 7: /* Enable logging to console */ + /* Enable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != -1) { console_loglevel = saved_console_loglevel; saved_console_loglevel = -1; } break; - case 8: /* Set level of messages printed to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: error = -EINVAL; if (len < 1 || len > 8) goto out; @@ -402,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) saved_console_loglevel = -1; error = 0; break; - case 9: /* Number of chars in the log buffer */ + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: error = log_end - log_start; break; - case 10: /* Size of the log buffer */ + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: diff --git a/security/commoncap.c b/security/commoncap.c index 677fad9d5cba..cf01b2eebb60 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -897,9 +897,10 @@ error: int cap_syslog(int type, bool from_file) { /* /proc/kmsg can open be opened by CAP_SYS_ADMIN */ - if (type != 1 && from_file) + if (type != SYSLOG_ACTION_OPEN && from_file) return 0; - if ((type != 3 && type != 10) && !capable(CAP_SYS_ADMIN)) + if ((type != SYSLOG_ACTION_READ_ALL && + type != SYSLOG_ACTION_SIZE_BUFFER) && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index a4862a0730fa..6b36ce2eef2e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2059,20 +2059,21 @@ static int selinux_syslog(int type, bool from_file) return rc; switch (type) { - case 3: /* Read last kernel messages */ - case 10: /* Return size of the log buffer */ + case SYSLOG_ACTION_READ_ALL: /* Read last kernel messages */ + case SYSLOG_ACTION_SIZE_BUFFER: /* Return size of the log buffer */ rc = task_has_system(current, SYSTEM__SYSLOG_READ); break; - case 6: /* Disable logging to console */ - case 7: /* Enable logging to console */ - case 8: /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_OFF: /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: /* Enable logging to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: rc = task_has_system(current, SYSTEM__SYSLOG_CONSOLE); break; - case 0: /* Close log */ - case 1: /* Open log */ - case 2: /* Read from log */ - case 4: /* Read/clear last kernel messages */ - case 5: /* Clear ring buffer */ + case SYSLOG_ACTION_CLOSE: /* Close log */ + case SYSLOG_ACTION_OPEN: /* Open log */ + case SYSLOG_ACTION_READ: /* Read from log */ + case SYSLOG_ACTION_READ_CLEAR: /* Read/clear last kernel messages */ + case SYSLOG_ACTION_CLEAR: /* Clear ring buffer */ default: rc = task_has_system(current, SYSTEM__SYSLOG_MOD); break; -- cgit v1.2.3 From 8cfb3343f70bcf9403218df120ecf345f06dd585 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 1 Feb 2010 21:34:14 -0700 Subject: of: make set_node_proc_entry private to proc_devtree.c We only need set_node_proc_entry in proc_devtree.c, so move it there. This fixes the !HAVE_ARCH_DEVTREE_FIXUPS build, as we can't make make the definition in linux/of.h conditional on this #define (definitions in asm/prom.h can't be exposed to linux/of.h, due to the enforced #include ordering). Signed-off-by: Jeremy Kerr Signed-off-by: Grant Likely --- fs/proc/proc_devtree.c | 5 +++-- include/linux/of.h | 6 ------ 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 123257bb356b..2309bf17203f 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -14,12 +14,13 @@ #include #include "internal.h" -#ifndef HAVE_ARCH_DEVTREE_FIXUPS static inline void set_node_proc_entry(struct device_node *np, struct proc_dir_entry *de) { -} +#ifdef HAVE_ARCH_DEVTREE_FIXUPS + np->pde = de; #endif +} static struct proc_dir_entry *proc_device_tree; diff --git a/include/linux/of.h b/include/linux/of.h index 3cc0d7ae290e..fd47c81d7a25 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -73,12 +73,6 @@ static inline void of_node_set_flag(struct device_node *n, unsigned long flag) set_bit(flag, &n->_flags); } -static inline void -set_node_proc_entry(struct device_node *dn, struct proc_dir_entry *de) -{ - dn->pde = de; -} - extern struct device_node *of_find_all_nodes(struct device_node *prev); #if defined(CONFIG_SPARC) -- cgit v1.2.3 From 50ab2fe147e22c8786552cda1791a61ae81b84d2 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Mon, 1 Feb 2010 21:34:14 -0700 Subject: proc_devtree: include linux/of.h Currenly, proc_devtree.c depends on asm/prom.h to include linux/of.h, to provide some device-tree definitions (eg, struct property). Instead, include linux/of.h directly. We still need asm/prom.h for HAVE_ARCH_DEVTREE_FIXUPS. Signed-off-by: Jeremy Kerr Signed-off-by: Grant Likely --- fs/proc/proc_devtree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 2309bf17203f..0ec45110e15e 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include "internal.h" -- cgit v1.2.3 From 7c540d9e3da38c3d1c15fb8059e4577a84ac0066 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Sun, 14 Feb 2010 07:13:41 -0700 Subject: proc_devtree: fix THIS_MODULE without module.h Commit e22f628395432b967f2f505858c64450f7835365 introduced a build breakage for ARM devtree work: the THIS_MODULE macro was added, but we don't have module.h This change adds the necessary #include to get THIS_MODULE defined. While we could just replace it with NULL (PROC_FS is a bool, not a tristate), using THIS_MODULE will prevent unexpected breakage if we ever do compile this as a module. Signed-off-by: Jeremy Kerr Signed-off-by: Grant Likely Acked-by: Benjamin Herrenschmidt Acked-by: Michal Simek --- fs/proc/proc_devtree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/proc') diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 0ec45110e15e..f8650dce74fb 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include "internal.h" -- cgit v1.2.3 From 7fee4868be91e71a3ee8e57289ebf5e10a12297e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 14 Jan 2010 01:03:28 -0500 Subject: Switch proc/self to nd_set_link() Signed-off-by: Al Viro --- fs/proc/base.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/base.c b/fs/proc/base.c index e42bbd843ed1..58324c299165 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2369,16 +2369,30 @@ static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return ERR_PTR(-ENOENT); - sprintf(tmp, "%d", task_tgid_nr_ns(current, ns)); - return ERR_PTR(vfs_follow_link(nd,tmp)); + char *name = ERR_PTR(-ENOENT); + if (tgid) { + name = __getname(); + if (!name) + name = ERR_PTR(-ENOMEM); + else + sprintf(name, "%d", tgid); + } + nd_set_link(nd, name); + return NULL; +} + +static void proc_self_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + char *s = nd_get_link(nd); + if (!IS_ERR(s)) + __putname(s); } static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, .follow_link = proc_self_follow_link, + .put_link = proc_self_put_link, }; /* -- cgit v1.2.3 From 7dc52157982ab771f40e3c0b7dc55b954c3c2d19 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 22 Feb 2010 17:04:52 -0800 Subject: vfs: Apply lockdep-based checking to rcu_dereference() uses Add lockdep-ified RCU primitives to alloc_fd(), files_fdtable() and fcheck_files(). Cc: Alexander Viro Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Alexander Viro LKML-Reference: <1266887105-1528-8-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- fs/file.c | 2 +- fs/proc/array.c | 2 ++ fs/proc/base.c | 6 +++++- include/linux/fdtable.h | 8 ++++++-- 4 files changed, 14 insertions(+), 4 deletions(-) (limited to 'fs/proc') diff --git a/fs/file.c b/fs/file.c index 87e129030ab1..38039af67663 100644 --- a/fs/file.c +++ b/fs/file.c @@ -478,7 +478,7 @@ repeat: error = fd; #if 1 /* Sanity check */ - if (rcu_dereference(fdt->fd[fd]) != NULL) { + if (rcu_dereference_raw(fdt->fd[fd]) != NULL) { printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); rcu_assign_pointer(fdt->fd[fd], NULL); } diff --git a/fs/proc/array.c b/fs/proc/array.c index 13b5d0708175..18e20feee251 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -270,7 +270,9 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) blocked = p->blocked; collect_sigign_sigcatch(p, &ignored, &caught); num_threads = atomic_read(&p->signal->count); + rcu_read_lock(); /* FIXME: is this correct? */ qsize = atomic_read(&__task_cred(p)->user->sigpending); + rcu_read_unlock(); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } diff --git a/fs/proc/base.c b/fs/proc/base.c index 58324c299165..623e2ffb5d2b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1095,8 +1095,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) + rcu_read_lock(); + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) { + rcu_read_unlock(); return -EPERM; + } + rcu_read_unlock(); if (count >= PAGE_SIZE) count = PAGE_SIZE - 1; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a2ec74bc4812..144412ffaced 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -57,7 +57,11 @@ struct files_struct { struct file * fd_array[NR_OPEN_DEFAULT]; }; -#define files_fdtable(files) (rcu_dereference((files)->fdt)) +#define files_fdtable(files) \ + (rcu_dereference_check((files)->fdt, \ + rcu_read_lock_held() || \ + lockdep_is_held(&(files)->file_lock) || \ + atomic_read(&files->count) == 1)) struct file_operations; struct vfsmount; @@ -78,7 +82,7 @@ static inline struct file * fcheck_files(struct files_struct *files, unsigned in struct fdtable *fdt = files_fdtable(files); if (fd < fdt->max_fds) - file = rcu_dereference(fdt->fd[fd]); + file = rcu_dereference_check(fdt->fd[fd], rcu_read_lock_held() || lockdep_is_held(&files->file_lock) || atomic_read(&files->count) == 1); return file; } -- cgit v1.2.3 From 587d4a17d837ac0f17edb26f1b6c80c0abda6343 Mon Sep 17 00:00:00 2001 From: "Helight.Xu" Date: Wed, 30 Dec 2009 13:24:41 +0800 Subject: some clean up in fs/proc EXPORT_SYMBOL(proc_symlink); EXPORT_SYMBOL(proc_mkdir); EXPORT_SYMBOL(create_proc_entry); EXPORT_SYMBOL(proc_create_data); EXPORT_SYMBOL(remove_proc_entry); Those EXPORT_SYMBOL shouldn't be in fs/proc/root.c, should be in fs/proc/generic.c. Signed-off-by: Helight.Xu Signed-off-by: Al Viro --- fs/proc/generic.c | 5 +++++ fs/proc/root.c | 6 ------ 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 480cb1065eec..9580abeadeb3 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -662,6 +662,7 @@ struct proc_dir_entry *proc_symlink(const char *name, } return ent; } +EXPORT_SYMBOL(proc_symlink); struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, struct proc_dir_entry *parent) @@ -700,6 +701,7 @@ struct proc_dir_entry *proc_mkdir(const char *name, { return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); } +EXPORT_SYMBOL(proc_mkdir); struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, struct proc_dir_entry *parent) @@ -728,6 +730,7 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, } return ent; } +EXPORT_SYMBOL(create_proc_entry); struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, struct proc_dir_entry *parent, @@ -762,6 +765,7 @@ out_free: out: return NULL; } +EXPORT_SYMBOL(proc_create_data); static void free_proc_entry(struct proc_dir_entry *de) { @@ -853,3 +857,4 @@ continue_removing: de->parent->name, de->name, de->subdir->name); pde_put(de); } +EXPORT_SYMBOL(remove_proc_entry); diff --git a/fs/proc/root.c b/fs/proc/root.c index b080b791d9e3..757c069f2a65 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -220,9 +220,3 @@ void pid_ns_release_proc(struct pid_namespace *ns) { mntput(ns->proc_mnt); } - -EXPORT_SYMBOL(proc_symlink); -EXPORT_SYMBOL(proc_mkdir); -EXPORT_SYMBOL(create_proc_entry); -EXPORT_SYMBOL(proc_create_data); -EXPORT_SYMBOL(remove_proc_entry); -- cgit v1.2.3 From 9f5596af44514f99e3a654a4f7cb813354b9e516 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 5 Feb 2010 00:40:25 -0500 Subject: take check for new events in namespace (guts of mounts_poll()) to namespace.c Signed-off-by: Al Viro --- fs/namespace.c | 15 +++++++++++++++ fs/proc/base.c | 10 ++-------- include/linux/mnt_namespace.h | 1 + 3 files changed, 18 insertions(+), 8 deletions(-) (limited to 'fs/proc') diff --git a/fs/namespace.c b/fs/namespace.c index d5906c19e08e..970fe79d7867 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -737,6 +737,21 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } +int mnt_had_events(struct proc_mounts *p) +{ + struct mnt_namespace *ns = p->ns; + int res = 0; + + spin_lock(&vfsmount_lock); + if (p->event != ns->event) { + p->event = ns->event; + res = 1; + } + spin_unlock(&vfsmount_lock); + + return res; +} + struct proc_fs_info { int flag; const char *str; diff --git a/fs/proc/base.c b/fs/proc/base.c index 58324c299165..746895ddfda1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -647,17 +647,11 @@ static int mounts_release(struct inode *inode, struct file *file) static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; - struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; - poll_wait(file, &ns->poll, wait); - - spin_lock(&vfsmount_lock); - if (p->event != ns->event) { - p->event = ns->event; + poll_wait(file, &p->ns->poll, wait); + if (mnt_had_events(p)) res |= POLLERR | POLLPRI; - } - spin_unlock(&vfsmount_lock); return res; } diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index d74785c2393a..0b89efc6f215 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -35,6 +35,7 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern const struct seq_operations mounts_op; extern const struct seq_operations mountinfo_op; extern const struct seq_operations mountstats_op; +extern int mnt_had_events(struct proc_mounts *); #endif #endif -- cgit v1.2.3 From d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:39 -0800 Subject: mm: clean up mm_counter Presently, per-mm statistics counter is defined by macro in sched.h This patch modifies it to - defined in mm.h as inlinf functions - use array instead of macro's name creation. This patch is for reducing patch size in future patch to modify implementation of per-mm counter. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +- include/linux/mm.h | 104 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 33 ++++++++++----- include/linux/sched.h | 54 ------------------------ kernel/fork.c | 3 +- kernel/tsacct.c | 1 + mm/filemap_xip.c | 2 +- mm/fremap.c | 2 +- mm/memory.c | 56 +++++++++++++++---------- mm/oom_kill.c | 4 +- mm/rmap.c | 10 ++--- mm/swapfile.c | 2 +- 12 files changed, 174 insertions(+), 101 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f277c4a111cb..375581276011 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -65,11 +65,11 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - *shared = get_mm_counter(mm, file_rss); + *shared = get_mm_counter(mm, MM_FILEPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; *data = mm->total_vm - mm->shared_vm; - *resident = *shared + get_mm_counter(mm, anon_rss); + *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 90957f14195c..2124cdb2d1d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -870,6 +870,110 @@ extern int mprotect_fixup(struct vm_area_struct *vma, */ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +/* + * per-process(per-mm_struct) statistics. + */ +#if USE_SPLIT_PTLOCKS +/* + * The mm counters are not protected by its page_table_lock, + * so must be incremented atomically. + */ +static inline void set_mm_counter(struct mm_struct *mm, int member, long value) +{ + atomic_long_set(&mm->rss_stat.count[member], value); +} + +static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + return (unsigned long)atomic_long_read(&mm->rss_stat.count[member]); +} + +static inline void add_mm_counter(struct mm_struct *mm, int member, long value) +{ + atomic_long_add(value, &mm->rss_stat.count[member]); +} + +static inline void inc_mm_counter(struct mm_struct *mm, int member) +{ + atomic_long_inc(&mm->rss_stat.count[member]); +} + +static inline void dec_mm_counter(struct mm_struct *mm, int member) +{ + atomic_long_dec(&mm->rss_stat.count[member]); +} + +#else /* !USE_SPLIT_PTLOCKS */ +/* + * The mm counters are protected by its page_table_lock, + * so can be incremented directly. + */ +static inline void set_mm_counter(struct mm_struct *mm, int member, long value) +{ + mm->rss_stat.count[member] = value; +} + +static inline unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + return mm->rss_stat.count[member]; +} + +static inline void add_mm_counter(struct mm_struct *mm, int member, long value) +{ + mm->rss_stat.count[member] += value; +} + +static inline void inc_mm_counter(struct mm_struct *mm, int member) +{ + mm->rss_stat.count[member]++; +} + +static inline void dec_mm_counter(struct mm_struct *mm, int member) +{ + mm->rss_stat.count[member]--; +} + +#endif /* !USE_SPLIT_PTLOCKS */ + +static inline unsigned long get_mm_rss(struct mm_struct *mm) +{ + return get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_ANONPAGES); +} + +static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) +{ + return max(mm->hiwater_rss, get_mm_rss(mm)); +} + +static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) +{ + return max(mm->hiwater_vm, mm->total_vm); +} + +static inline void update_hiwater_rss(struct mm_struct *mm) +{ + unsigned long _rss = get_mm_rss(mm); + + if ((mm)->hiwater_rss < _rss) + (mm)->hiwater_rss = _rss; +} + +static inline void update_hiwater_vm(struct mm_struct *mm) +{ + if (mm->hiwater_vm < mm->total_vm) + mm->hiwater_vm = mm->total_vm; +} + +static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, + struct mm_struct *mm) +{ + unsigned long hiwater_rss = get_mm_hiwater_rss(mm); + + if (*maxrss < hiwater_rss) + *maxrss = hiwater_rss; +} + /* * A callback you can register to apply pressure to ageable caches. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36f96271306c..e1ca64be6678 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -24,12 +24,6 @@ struct address_space; #define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) -#if USE_SPLIT_PTLOCKS -typedef atomic_long_t mm_counter_t; -#else /* !USE_SPLIT_PTLOCKS */ -typedef unsigned long mm_counter_t; -#endif /* !USE_SPLIT_PTLOCKS */ - /* * Each physical page in the system has a struct page associated with * it to keep track of whatever it is we are using the page for at the @@ -201,6 +195,22 @@ struct core_state { struct completion startup; }; +enum { + MM_FILEPAGES, + MM_ANONPAGES, + NR_MM_COUNTERS +}; + +#if USE_SPLIT_PTLOCKS +struct mm_rss_stat { + atomic_long_t count[NR_MM_COUNTERS]; +}; +#else /* !USE_SPLIT_PTLOCKS */ +struct mm_rss_stat { + unsigned long count[NR_MM_COUNTERS]; +}; +#endif /* !USE_SPLIT_PTLOCKS */ + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -227,11 +237,6 @@ struct mm_struct { * by mmlist_lock */ - /* Special counters, in some configurations protected by the - * page_table_lock, in other configurations by being atomic. - */ - mm_counter_t _file_rss; - mm_counter_t _anon_rss; unsigned long hiwater_rss; /* High-watermark of RSS usage */ unsigned long hiwater_vm; /* High-water virtual memory usage */ @@ -244,6 +249,12 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + /* + * Special counters, in some configurations protected by the + * page_table_lock, in other configurations by being atomic. + */ + struct mm_rss_stat rss_stat; + struct linux_binfmt *binfmt; cpumask_t cpu_vm_mask; diff --git a/include/linux/sched.h b/include/linux/sched.h index 4b1753f7e48e..cbeafa49a53b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -396,60 +396,6 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif -#if USE_SPLIT_PTLOCKS -/* - * The mm counters are not protected by its page_table_lock, - * so must be incremented atomically. - */ -#define set_mm_counter(mm, member, value) atomic_long_set(&(mm)->_##member, value) -#define get_mm_counter(mm, member) ((unsigned long)atomic_long_read(&(mm)->_##member)) -#define add_mm_counter(mm, member, value) atomic_long_add(value, &(mm)->_##member) -#define inc_mm_counter(mm, member) atomic_long_inc(&(mm)->_##member) -#define dec_mm_counter(mm, member) atomic_long_dec(&(mm)->_##member) - -#else /* !USE_SPLIT_PTLOCKS */ -/* - * The mm counters are protected by its page_table_lock, - * so can be incremented directly. - */ -#define set_mm_counter(mm, member, value) (mm)->_##member = (value) -#define get_mm_counter(mm, member) ((mm)->_##member) -#define add_mm_counter(mm, member, value) (mm)->_##member += (value) -#define inc_mm_counter(mm, member) (mm)->_##member++ -#define dec_mm_counter(mm, member) (mm)->_##member-- - -#endif /* !USE_SPLIT_PTLOCKS */ - -#define get_mm_rss(mm) \ - (get_mm_counter(mm, file_rss) + get_mm_counter(mm, anon_rss)) -#define update_hiwater_rss(mm) do { \ - unsigned long _rss = get_mm_rss(mm); \ - if ((mm)->hiwater_rss < _rss) \ - (mm)->hiwater_rss = _rss; \ -} while (0) -#define update_hiwater_vm(mm) do { \ - if ((mm)->hiwater_vm < (mm)->total_vm) \ - (mm)->hiwater_vm = (mm)->total_vm; \ -} while (0) - -static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm) -{ - return max(mm->hiwater_rss, get_mm_rss(mm)); -} - -static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, - struct mm_struct *mm) -{ - unsigned long hiwater_rss = get_mm_hiwater_rss(mm); - - if (*maxrss < hiwater_rss) - *maxrss = hiwater_rss; -} - -static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm) -{ - return max(mm->hiwater_vm, mm->total_vm); -} extern void set_dumpable(struct mm_struct *mm, int value); extern int get_dumpable(struct mm_struct *mm); diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..7616bcf107b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); + memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048edf..0a67e041edf8 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * fill in basic accounting fields diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 1888b2d71bb8..78b94f0b6d5d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -194,7 +194,7 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); page_cache_release(page); diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85abbb39..46f5dacf90a2 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); } } else { if (!pte_file(pte)) diff --git a/mm/memory.c b/mm/memory.c index 72fb5f39bccc..c57678478801 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -121,6 +121,7 @@ static int __init init_zero_pfn(void) } core_initcall(init_zero_pfn); + /* * If a p?d_bad entry is found while walking page tables, report * the error, before resetting entry to p?d_none. Usually (but @@ -376,12 +377,18 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) return 0; } -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +static inline void init_rss_vec(int *rss) { - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); } /* @@ -632,7 +639,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (page) { get_page(page); page_dup_rmap(page); - rss[PageAnon(page)]++; + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; } out_set_pte: @@ -648,11 +658,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; again: - rss[1] = rss[0] = 0; + init_rss_vec(rss); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; @@ -688,7 +699,7 @@ again: arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap_nested(orig_src_pte); - add_mm_rss(dst_mm, rss[0], rss[1]); + add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); @@ -816,8 +827,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm = tlb->mm; pte_t *pte; spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; + int rss[NR_MM_COUNTERS]; + + init_rss_vec(rss); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -863,14 +875,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss--; + rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); - file_rss--; + rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) @@ -893,7 +905,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - add_mm_rss(mm, file_rss, anon_rss); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -1527,7 +1539,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -2163,11 +2175,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_FILEPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2604,7 +2616,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); @@ -2688,7 +2700,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); setpte: set_pte_at(mm, address, page_table, entry); @@ -2842,10 +2854,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, anon_rss); + inc_mm_counter(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, file_rss); + inc_mm_counter(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 237050478f28..35755a4156d6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", task_pid_nr(p), p->comm, K(p->mm->total_vm), - K(get_mm_counter(p->mm, anon_rss)), - K(get_mm_counter(p->mm, file_rss))); + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); /* diff --git a/mm/rmap.c b/mm/rmap.c index 278cd277bdec..73d0472884c2 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -815,9 +815,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { if (PageAnon(page)) - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); set_pte_at(mm, address, pte, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { @@ -839,7 +839,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration @@ -857,7 +857,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); page_remove_rmap(page); page_cache_release(page); @@ -996,7 +996,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c0585b16418..893984946a2c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -840,7 +840,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); -- cgit v1.2.3 From b084d4353ff99d824d3bc5a5c2c22c70b1fba722 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:42 -0800 Subject: mm: count swap usage A frequent questions from users about memory management is what numbers of swap ents are user for processes. And this information will give some hints to oom-killer. Besides we can count the number of swapents per a process by scanning /proc//smaps, this is very slow and not good for usual process information handler which works like 'ps' or 'top'. (ps or top is now enough slow..) This patch adds a counter of swapents to mm_counter and update is at each swap events. Information is exported via /proc//status file as [kamezawa@bluextal memory]$ cat /proc/self/status Name: cat State: R (running) Tgid: 2910 Pid: 2910 PPid: 2823 TracerPid: 0 Uid: 500 500 500 500 Gid: 500 500 500 500 FDSize: 256 Groups: 500 VmPeak: 82696 kB VmSize: 82696 kB VmLck: 0 kB VmHWM: 432 kB VmRSS: 432 kB VmData: 172 kB VmStk: 84 kB VmExe: 48 kB VmLib: 1568 kB VmPTE: 40 kB VmSwap: 0 kB <=============== this. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Reviewed-by: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 2 ++ fs/proc/task_mmu.c | 9 ++++++--- include/linux/mm_types.h | 1 + mm/memory.c | 16 ++++++++++++---- mm/rmap.c | 1 + mm/swapfile.c | 1 + 6 files changed, 23 insertions(+), 7 deletions(-) (limited to 'fs/proc') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index e418f3d8f427..b5c5fc657a88 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -164,6 +164,7 @@ read the file /proc/PID/status: VmExe: 68 kB VmLib: 1412 kB VmPTE: 20 kb + VmSwap: 0 kB Threads: 1 SigQ: 0/28578 SigPnd: 0000000000000000 @@ -219,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries + VmSwap size of swap usage (the number of referred swapents) Threads number of threads SigQ number of signals queued/max. number for queue SigPnd bitmap of pending signals for the thread diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 375581276011..183f8ff5f400 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -16,7 +16,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib; + unsigned long data, text, lib, swap; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -36,6 +36,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -46,7 +47,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" - "VmPTE:\t%8lu kB\n", + "VmPTE:\t%8lu kB\n" + "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), (total_vm - mm->reserved_vm) << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), @@ -54,7 +56,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10); + (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + swap << (PAGE_SHIFT-10)); } unsigned long task_vsize(struct mm_struct *mm) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21861239ab0c..19549d7275ab 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -198,6 +198,7 @@ struct core_state { enum { MM_FILEPAGES, MM_ANONPAGES, + MM_SWAPENTS, NR_MM_COUNTERS }; diff --git a/mm/memory.c b/mm/memory.c index a4597614f18d..77d9f840936b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -679,7 +679,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } - if (is_write_migration_entry(entry) && + if (likely(!non_swap_entry(entry))) + rss[MM_SWAPENTS]++; + else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -974,9 +976,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); - } else if - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); + } else { + swp_entry_t entry = pte_to_swp_entry(ptent); + + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); @@ -2692,6 +2699,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, */ inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); diff --git a/mm/rmap.c b/mm/rmap.c index 73d0472884c2..5cb47111f79e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -840,6 +840,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, spin_unlock(&mmlist_lock); } dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration diff --git a/mm/swapfile.c b/mm/swapfile.c index 893984946a2c..187a21f8b7bd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -840,6 +840,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, -- cgit v1.2.3 From d554ed895dc8f293cc712c71f14b101ace82579a Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 5 Mar 2010 13:42:42 -0800 Subject: fs: use rlimit helpers Make sure compiler won't do weird things with limits. E.g. fetching them twice may return 2 different values after writable limits are implemented. I.e. either use rlimit helpers added in commit 3e10e716abf3 ("resource: add helpers for fetching rlimits") or ACCESS_ONCE if not applicable. Signed-off-by: Jiri Slaby Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/attr.c | 2 +- fs/binfmt_aout.c | 2 +- fs/binfmt_flat.c | 2 +- fs/exec.c | 8 ++++---- fs/fcntl.c | 2 +- fs/file.c | 2 +- fs/proc/array.c | 4 ++-- fs/select.c | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/attr.c b/fs/attr.c index 0a6ea54cde7d..0815e93bb487 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -81,7 +81,7 @@ int inode_newsize_ok(const struct inode *inode, loff_t offset) if (inode->i_size < offset) { unsigned long limit; - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + limit = rlimit(RLIMIT_FSIZE); if (limit != RLIM_INFINITY && offset > limit) goto out_sig; if (offset > inode->i_sb->s_maxbytes) diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index fdd397099172..61dd00a6c7b4 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -247,7 +247,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (ex.a_data + ex.a_bss > rlim) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 42c6b4a54445..e0e769bdca59 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -501,7 +501,7 @@ static int load_flat_file(struct linux_binprm * bprm, * size limits imposed on them by creating programs with large * arrays in the data or bss. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim >= RLIM_INFINITY) rlim = ~0; if (data_len + bss_len > rlim) { diff --git a/fs/exec.c b/fs/exec.c index 591030735591..6348d79401de 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -195,7 +195,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * to work from. */ rlim = current->signal->rlim; - if (size > rlim[RLIMIT_STACK].rlim_cur / 4) { + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) { put_page(page); return NULL; } @@ -579,7 +579,7 @@ int setup_arg_pages(struct linux_binprm *bprm, #ifdef CONFIG_STACK_GROWSUP /* Limit stack size to 1GB */ - stack_base = current->signal->rlim[RLIMIT_STACK].rlim_max; + stack_base = rlimit_max(RLIMIT_STACK); if (stack_base > (1 << 30)) stack_base = 1 << 30; @@ -1535,7 +1535,7 @@ static int format_corename(char *corename, long signr) /* core limit size */ case 'c': rc = snprintf(out_ptr, out_end - out_ptr, - "%lu", current->signal->rlim[RLIMIT_CORE].rlim_cur); + "%lu", rlimit(RLIMIT_CORE)); if (rc > out_end - out_ptr) goto out; out_ptr += rc; @@ -1800,7 +1800,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) struct coredump_params cprm = { .signr = signr, .regs = regs, - .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, + .limit = rlimit(RLIMIT_CORE), }; audit_core_dumps(signr); diff --git a/fs/fcntl.c b/fs/fcntl.c index 97e01dc0d95f..452d02f9075e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -344,7 +344,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, switch (cmd) { case F_DUPFD: case F_DUPFD_CLOEXEC: - if (arg >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (arg >= rlimit(RLIMIT_NOFILE)) break; err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0); if (err >= 0) { diff --git a/fs/file.c b/fs/file.c index 38039af67663..34bb7f71d994 100644 --- a/fs/file.c +++ b/fs/file.c @@ -257,7 +257,7 @@ int expand_files(struct files_struct *files, int nr) * N.B. For clone tasks sharing a files structure, this test * will limit the total number of files that can be opened. */ - if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (nr >= rlimit(RLIMIT_NOFILE)) return -EMFILE; /* Do we need to expand? */ diff --git a/fs/proc/array.c b/fs/proc/array.c index 18e20feee251..aa8637b81028 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -273,7 +273,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) rcu_read_lock(); /* FIXME: is this correct? */ qsize = atomic_read(&__task_cred(p)->user->sigpending); rcu_read_unlock(); - qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; + qlim = task_rlimit(p, RLIMIT_SIGPENDING); unlock_task_sighand(p, &flags); } @@ -420,7 +420,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = sig->cutime; cstime = sig->cstime; cgtime = sig->cgtime; - rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; + rsslim = ACCESS_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur); /* add up live thread stats at the group level */ if (whole) { diff --git a/fs/select.c b/fs/select.c index fd38ce2e32e3..73715e90030f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -821,7 +821,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct poll_list *walk = head; unsigned long todo = nfds; - if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; len = min_t(unsigned int, nfds, N_STACK_PPS); -- cgit v1.2.3 From e17a5765f20d1219c3f05eb17aab11671978e0ec Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 5 Mar 2010 13:43:59 -0800 Subject: proc: do translation + unlink atomically at remove_proc_entry() remove_proc_entry() does lock lookup parent unlock lock unlink proc entry from lists unlock which can be made bit more correct by doing parent translation + unlink without dropping lock. Signed-off-by: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 9580abeadeb3..ce2d95477701 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -291,19 +291,17 @@ static const struct inode_operations proc_file_inode_operations = { * returns the struct proc_dir_entry for "/proc/tty/driver", and * returns "serial" in residual. */ -static int xlate_proc_name(const char *name, - struct proc_dir_entry **ret, const char **residual) +static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) { const char *cp = name, *next; struct proc_dir_entry *de; int len; - int rtn = 0; de = *ret; if (!de) de = &proc_root; - spin_lock(&proc_subdir_lock); while (1) { next = strchr(cp, '/'); if (!next) @@ -314,17 +312,24 @@ static int xlate_proc_name(const char *name, if (proc_match(len, cp, de)) break; } - if (!de) { - rtn = -ENOENT; - goto out; - } + if (!de) + return -ENOENT; cp += len + 1; } *residual = cp; *ret = de; -out: + return 0; +} + +static int xlate_proc_name(const char *name, struct proc_dir_entry **ret, + const char **residual) +{ + int rv; + + spin_lock(&proc_subdir_lock); + rv = __xlate_proc_name(name, ret, residual); spin_unlock(&proc_subdir_lock); - return rtn; + return rv; } static DEFINE_IDA(proc_inum_ida); @@ -797,11 +802,13 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) const char *fn = name; int len; - if (xlate_proc_name(name, &parent, &fn) != 0) + spin_lock(&proc_subdir_lock); + if (__xlate_proc_name(name, &parent, &fn) != 0) { + spin_unlock(&proc_subdir_lock); return; + } len = strlen(fn); - spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { if (proc_match(len, fn, *p)) { de = *p; -- cgit v1.2.3 From 12bac0d9f4dbf3445a0319beee848d15fa32775e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 5 Mar 2010 13:44:00 -0800 Subject: proc: warn on non-existing proc entries * warn if creation goes on to non-existent directory * warn if removal goes on from non-existing directory * warn if non-existing proc entry is removed Signed-off-by: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/generic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs/proc') diff --git a/fs/proc/generic.c b/fs/proc/generic.c index ce2d95477701..08f4d71dacd7 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -312,8 +312,10 @@ static int __xlate_proc_name(const char *name, struct proc_dir_entry **ret, if (proc_match(len, cp, de)) break; } - if (!de) + if (!de) { + WARN(1, "name '%s'\n", name); return -ENOENT; + } cp += len + 1; } *residual = cp; @@ -818,8 +820,10 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) } } spin_unlock(&proc_subdir_lock); - if (!de) + if (!de) { + WARN(1, "name '%s'\n", name); return; + } spin_lock(&de->pde_unload_lock); /* -- cgit v1.2.3